# Ramsey King
# DSC 680 - Applied Data Science
# April 9 - 2022

###  This Jupyter file will pull the talk information down from the websites speeches.byu.edu and www.churchofjesuschrist.org/study/general-conference/speakers

In [33]:
# Retrieve talks by speaker on speeches.byu.edu using BeautifulSoup

from bs4 import BeautifulSoup as bs
import requests
# import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

In [34]:
# Collect the unique talk URLs listed on each speaker's page at speeches.byu.edu.
# For my reference, number of talks by speaker:  Nelson - 22, Maxwell - 30, Holland - 27, Hinckley - 41, Monson - 14, Oaks - 35, Eyring - 25 (Total - 194)
url = "https://speeches.byu.edu/speakers/"

url_list = [
    'russell-m-nelson/', 'neal-a-maxwell/', 'jeffrey-r-holland/', 'gordon-b-hinckley/', 'thomas-s-monson/',
    'dallin-h-oaks', 'henry-b-eyring'
]

# Raw string so "\D" reaches the regex engine instead of being parsed as an
# (invalid) string escape. Compiled once, outside the loop.
talk_href = re.compile(r"^https://speeches.byu.edu/talks/[\D+]")

# Some of the links are repeated with '?M=V' (video) and '?M=A' (audio) query
# suffixes; those are stripped so we end up with a unique set of talk links.
video = '?M=V'
audio = '?M=A'

links = []
for u in url_list:
    response = requests.get(url + u)
    soup = bs(response.content, 'lxml')
    for anchor in soup.find_all("a", attrs={'href': talk_href}):
        links.append(anchor.get('href').replace(video, ''))

# Strip the audio suffix and de-duplicate once, after every speaker page has
# been read, rather than re-processing the whole list on each iteration.
new_links = list({link.replace(audio, '') for link in links})



In [35]:
# Display how many unique talk URLs were collected from speeches.byu.edu.
len(new_links)

195

In [36]:
# We may have one duplicate, so something to be aware when we pull the talk data.

# Retrieves talk text from each of the urls using beautiful soup.  The speaker
# (taken from the URL), the speaker's position and the talk/sermon text are
# retrieved, and the speaker and text are put into a dataframe.  A csv file can
# also be saved due to the time it takes to run or in case we need to work offline.
talk_text = []
title_text = []
topics = []
dates = []
speakers = []
speaker_position = []

for link in new_links:
    response = requests.get(link)
    soup = bs(response.content, 'lxml')

    # Links look like https://speeches.byu.edu/talks/<speaker-slug>/...,
    # so element 4 of the '/'-split is the speaker slug.
    speakers.append(link.split('/')[4].replace('-', ' '))

    result = soup.find('p', class_='single-speech__speaker-subtext single-speech__speaker-position')
    speaker_position.append(result.text if result else 'Title Not Found')

    # Append exactly one text entry per page. Appending once per matching div
    # lets talk_text drift out of step with speakers whenever a page has zero
    # or several content divs, which makes pd.DataFrame raise ValueError.
    content_divs = soup.find_all('div', class_='single-speech__content')
    talk_text.append(' '.join(div.get_text(strip=True) for div in content_divs))

    # Disabled: title, date and topic extraction (kept for reference).
    # for h1 in soup.find_all('h1', class_='single-speech__title'):
    #     title_text.append(h1.get_text(strip=True))
    # for p in soup.find_all('p', class_='single-speech__speaker-subtext single-speech__date'):
    #     dates.append(p.get_text(strip=True))
    # for p in soup.find_all('p', class_='single-speech__related-tag'):
    #     temp_list.append(p.get_text(strip=True))
    #     temp_str = ','.join(temp_list)
    # topics.append(temp_str)


# dict = {'titles': title_text , 'speaker': speakers, 'speaker_position': speaker_position , 'date': dates, 'talks': talk_text, 'topics': topics, }
dict_speeches_byu = {'speaker': speakers, 'talks': talk_text}
df = pd.DataFrame(dict_speeches_byu)
# df.to_csv('talks.csv')

df.sample(30)

Unnamed: 0,speaker,talks
135,gordon b hinckley,The text for this speech is unavailable. Pleas...
69,jeffrey r holland,The Duration of the WarIn the final few weeks ...
143,jeffrey r holland,This responsibility to speak to you never gets...
20,dallin h oaks,One of the principles of the BYU Code of Honor...
154,russell m nelson,"Thank you, Elder Oaks, for your very important..."
107,dallin h oaks,Imagine the emotions I feel as I stand before ...
67,thomas s monson,The text for this speech is unavailable. Pleas...
49,thomas s monson,"I drove here today in a snowstorm, and my thou..."
131,jeffrey r holland,"In general conference of October 2016, I told ..."
6,dallin h oaks,My fellow students: I have spoken to BYU audie...


In [37]:
# Now to pull information from the church general conference website.

# Get the list of talk urls from each speaker's page.
# For my reference, number of talks by speaker:  Nelson - , Maxwell - , Holland - , Hinckley - , Monson - , Oaks - , Eyring -  (Total - )
gc_url = "https://www.churchofjesuschrist.org/study/general-conference/speakers/"

gc_url_list = [
    'russell-m-nelson/', 'neal-a-maxwell/', 'jeffrey-r-holland/', 'gordon-b-hinckley/', 'thomas-s-monson/',
    'dallin-h-oaks', 'henry-b-eyring'
]

# Raw string so "\w" reaches the regex engine instead of being parsed as an
# (invalid) string escape. Compiled once, outside the loop.
gc_talk_href = re.compile(r"^/study/general-conference/\w+/(?:\w+/)(?:\w+)")

links = []

for u in gc_url_list:
    response = requests.get(gc_url + u)
    soup = bs(response.content, 'lxml')
    # The hrefs are site-relative paths; the host is prepended when each talk is fetched.
    links.extend(anchor.get('href') for anchor in soup.find_all("a", attrs={'href': gc_talk_href}))

print(links)



In [38]:
# Download every General Conference talk in `links` and build a dataframe with
# one row per talk page: the speaker (identified from the page byline) and the
# talk text.
talk_text = []
speakers = []

# Surname as it appears in the lower-cased byline -> normalised speaker name.
# Checked in insertion order, matching the original if/elif precedence.
speaker_by_surname = {
    'nelson': 'russell m nelson',
    'maxwell': 'neal a maxwell',
    'holland': 'jeffrey r holland',
    'hinckley': 'gordon b hinckley',
    'monson': 'thomas s monson',
    'oaks': 'dallin h oaks',
    'eyring': 'henry b eyring',
}
# NOTE(review): any byline that matches none of the surnames above is labelled
# as Eyring, as in the original fall-through branch. This may mislabel talks
# by other speakers -- confirm this is intended.
fallback_speaker = 'henry b eyring'

for link in links:
    url = 'https://www.churchofjesuschrist.org' + link
    response = requests.get(url)
    soup = bs(response.content, 'lxml')

    # Read the first paragraph of the byline, guarding against pages that have
    # no byline div or no <p> inside it (previously an AttributeError).
    byline = soup.find('div', class_='byline')
    byline_p = byline.find('p') if byline else None
    byline_text = byline_p.text.lower() if byline_p else ''

    # Append exactly one speaker and one text per page so the two lists always
    # have equal length; appending per matching div lets them diverge and makes
    # pd.DataFrame raise ValueError.
    speakers.append(next(
        (name for surname, name in speaker_by_surname.items() if surname in byline_text),
        fallback_speaker,
    ))

    body_blocks = soup.find_all('div', class_='body-block')
    talk_text.append(' '.join(
        block.get_text(separator=' ', strip=True) for block in body_blocks
    ))

dict_gc = {'speaker': speakers, 'talks': talk_text}
df_gc = pd.DataFrame(dict_gc)

# Disabled: export/preview of the BYU dataframe (kept for reference).
# df.to_csv('talks.csv')
# df.head()
df_gc.sample(30)

Unnamed: 0,speaker,talks
544,henry b eyring,It is proposed that we sustain Thomas Spencer ...
123,neal a maxwell,This is an appropriate moment to thank Elder H...
572,henry b eyring,"My beloved brothers and sisters, it is a joy t..."
193,jeffrey r holland,Surely and steadily The Church of Jesus Christ...
515,henry b eyring,"Brothers and sisters, I will now present to yo..."
98,russell m nelson,If our faith be united in prayer that we may b...
121,neal a maxwell,Teaching about history’s major apostasies has ...
517,henry b eyring,"My beloved brethren, I am grateful for the pri..."
590,henry b eyring,The world seems to be in commotion. There are ...
177,jeffrey r holland,Prophecies regarding the last days often refer...


In [39]:
# Combine the BYU speeches and General Conference talks into one corpus and
# save it. DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the replacement and, like append, keeps each frame's own index.
df = pd.concat([df, df_gc])
df.to_csv('talk_corpus.csv')
df.sample(50)

Unnamed: 0,speaker,talks
151,dallin h oaks,I am going to speak this morning about revelat...
559,henry b eyring,"My beloved young sisters, you are the bright h..."
363,thomas s monson,This has been a wonderful closing session. I’v...
160,neal a maxwell,What a delightful and moving musical presentat...
593,henry b eyring,Hundreds of thousands of people in the last ye...
489,dallin h oaks,“What think ye of Christ?” ( Matt. 22:42 .) Th...
255,gordon b hinckley,"Now my beloved brethren, I speak with a desire..."
591,henry b eyring,I am grateful for the honor of speaking in the...
351,thomas s monson,"My beloved brothers and sisters, I extend my l..."
392,thomas s monson,"Some years ago as our youngest son, Clark, was..."
