### Libraries

In [2]:
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

### Functions

In [3]:
def get_media(link):
    """Scrape one series page into a DataFrame with one row per recording.

    The table is read from the first ``div.entry-content`` on the page, using
    its first three ``<p>`` elements as the date, speaker and title columns
    (one entry per text line).  Links are taken from the title paragraph,
    which is cut into per-row fragments at every ``<br/>``.

    Parameters
    ----------
    link : dict
        Must provide ``'link'`` (page path appended to the site root) plus
        ``'testament'`` and ``'book'``, which are copied onto every row.

    Returns
    -------
    pandas.DataFrame
        Columns ``date, speaker, title, audioLink, handoutLink, testament,
        book, date_str, year``.  ``audioLink`` / ``handoutLink`` are ``''``
        when a row has no such link.  Rows are built with ``zip``, so the
        frame is as long as the shortest of the parsed columns.

    Raises
    ------
    requests.HTTPError
        If the page request returns an error status.
    IndexError
        If the page has no ``div.entry-content`` or fewer than three ``<p>``.
    """
    url = f"https://www.gracecovenantbaptist.org/{link['link']}"
    # A timeout keeps one stalled page from hanging the whole scrape, and the
    # status check stops an error page from being parsed as if it were a table.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    # NOTE(review): no parser is named, so Beautiful Soup uses whichever one is
    # installed; pin it explicitly once the intended parser is confirmed.
    soup = BeautifulSoup(response.text)
    ps = soup.find_all("div", {"class": "entry-content"})[0].find_all("p")
    dates = ps[0].text.splitlines()
    speakers = ps[1].text.splitlines()
    sermons = ps[2].text.splitlines()

    # Exactly one audio entry and one handout entry is appended per fragment so
    # both lists stay aligned with dates/speakers/sermons when zipped below.
    links = []
    handouts = []
    for fragment in str(ps[2]).split('<br/>'):
        hrefs = [a['href'] for a in BeautifulSoup(fragment).find_all('a', href=True)]
        first = hrefs[0] if hrefs else ''
        if first.endswith('mp3'):
            # Audio first; a second link on the same row is treated as the handout.
            links.append(first)
            handouts.append(hrefs[1] if len(hrefs) > 1 else '')
        else:
            # No link at all (first == ''), or the only/first link is not audio.
            links.append('')
            handouts.append(first)

    df = pd.DataFrame(
        list(zip(dates, speakers, sermons, links, handouts)),
        columns=['date', 'speaker', 'title', 'audioLink', 'handoutLink'],
    )
    df['testament'] = link['testament']
    df['book'] = link['book']
    # The date text is sliced positionally: last 4 chars = year, chars 0-1 and
    # 3-4 = the two leading fields (presumably MM/DD/YYYY -- confirm on the site).
    df['date_str'] = df.date.apply(lambda x: x[-4:] + "-" + x[:2] + "-" + x[3:5])
    df['year'] = df.date.apply(lambda x: x[-4:])
    return df

In [4]:
def get_ss(link):
    """Scrape one Sunday-school / topical page into a DataFrame of lessons.

    Pages whose ``link['format']`` is ``'old'`` keep each lesson in its own
    ``<p>`` (fields separated by runs of spaces, with an optional ``<a>``
    carrying the title and audio URL) and are parsed here.  Any other format
    is delegated to ``get_media``.

    Parameters
    ----------
    link : dict
        Must provide ``'format'``, ``'link'``, ``'book'`` and ``'testament'``.

    Returns
    -------
    pandas.DataFrame
        For ``'old'`` pages: columns ``date, speaker, title, audioLink, book,
        testament, date_str, year`` (an empty frame with those columns when the
        page holds no lessons).  Otherwise whatever ``get_media`` returns.

    Raises
    ------
    requests.HTTPError
        If the page request returns an error status.
    IndexError
        If the page has no ``div.entry-content`` or a paragraph does not have
        the expected date / speaker (/ title) fields.
    """
    if link['format'] != 'old':
        return get_media(link)

    url = f"https://www.gracecovenantbaptist.org/{link['link']}"
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text)
    ps = soup.find_all("div", {"class": "entry-content"})[0].find_all("p")

    lessons = []
    for p in ps:
        markup = str(p)
        # Skip paragraphs too short to hold a lesson (e.g. empty spacer <p>s).
        if len(markup) <= 10:
            continue

        # Everything before the first <a> holds the plain-text fields, which
        # the page separates with two or more spaces.
        pieces = markup.split('<a')
        fields = [
            x
            for x in pieces[0].replace('\xa0', '').split('<p>')[1].strip().split('  ')
            if x != ''
        ]
        date = fields[0]
        speaker = fields[1]
        try:
            anchor = pieces[1]
            link2 = anchor.split('href="')[1].split('"')[0]
            title = (
                anchor.split('">')[1]
                .split('</p>')[0]
                .replace('</a>', ' ')
                .replace('\xa0', '')
            )
        except IndexError:
            # No usable <a>: the title is the third plain-text field instead.
            link2 = ''
            title = fields[2].split('</p')[0]

        lessons.append({
            'date': date,
            'speaker': speaker,
            'title': title,
            'audioLink': link2,
            'book': link['book'],
            'testament': link['testament'],
        })

    # Naming the columns keeps the frame well-formed even when no lesson was found.
    df = pd.DataFrame(
        lessons,
        columns=['date', 'speaker', 'title', 'audioLink', 'book', 'testament'],
    )
    df['date_str'] = df.date.apply(lambda x: x[-4:] + "-" + x[:2] + "-" + x[3:5])
    df['year'] = df.date.apply(lambda x: x[-4:])
    return df

### Sermons

In [5]:
# Series pages to scrape with get_media(): one (testament, book, page path)
# triple per book, expanded into the dict shape the scraper expects.
sermon_links = [
    {'testament': testament, 'book': book, 'link': path}
    for testament, book, path in [
        ('NT', 'John', '2013/06/22/sermons-in-the-gospel-of-john/'),
        ('NT', 'Mark', '2013/06/22/sermons-in-mark/'),
        ('NT', '1 Corinthians', '2013/06/22/sermons-in-1-corinthians/'),
        ('NT', 'Galatians', '2013/06/22/sermons-in-galatians/'),
        ('NT', 'Ephesians', '2016/10/03/ephesians/'),
        ('NT', 'Philippians', '2022/06/06/sermons-in-philippians/'),
        ('NT', 'Philemon', '2018/04/16/sermons-on-philemon/'),
        ('NT', 'Hebrews', '2018/07/31/sermons-in-hebrews/'),
        ('NT', '1 Peter', '2013/06/22/sermons-in-1-peter/'),
        ('NT', '2 Peter and Jude', '2016/01/12/2-peterjude/'),
        ('NT', '1-3 John', '2013/06/22/sermons-in-the-epistles-of-john/'),
        ('NT', 'Revelation', '2013/06/22/sermons-in-revelation/'),
        ('OT', 'Ruth', '2013/06/22/sermon-ruth/'),
        ('OT', 'Esther', '2013/06/22/sermons-on-esther/'),
        ('OT', 'Psalms', '2013/06/22/164/'),
        ('OT', 'Ecclesiastes', '2015/08/11/ecclesiastes/'),
        ('OT', 'Jonah', '2013/06/22/sermons-in-jonah/'),
        ('OT', 'Habakkuk', '2013/06/22/sermons-in-habakkuk/'),
    ]
]

In [6]:
# Scrape every page in sermon_links and stack the results under one fresh
# 0..n-1 index (takes about 20 seconds).
df1 = pd.concat(
    [get_media(page) for page in sermon_links],
    ignore_index=True,
)

In [7]:
# Order the scraped sermons by their sortable date string and tag the category.
df1 = df1.sort_values(by='date_str').assign(category='sermon')

### Luke (different page format)

In [8]:
# Fetch the Luke series page.  The timeout prevents an indefinite hang and the
# status check stops an error page from being parsed as if it were the table.
url = 'https://www.gracecovenantbaptist.org/2020/09/14/sermons-in-luke-2/'
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text)

In [9]:
# The Luke page lays its table out as floated <div> columns rather than the
# one-<p>-per-column layout get_media() reads, so each column is collected by
# matching the exact inline style of its <div>s inside the first entry-content.
entry = soup.find_all("div", {"class": "entry-content"})[0]

# Date column: keep only 10-character lines (presumably MM/DD/YYYY dates --
# that is how the date text is sliced when date_str is built).
date_divs = entry.find_all('div', attrs={'style': 'width: 15%; padding: 0 10px 0 0; float: left;'})
dates = [line for div in date_divs for line in div.text.splitlines() if len(line) == 10]

# Speaker column: drop the 'Speaker' header cell and blank lines.
speaker_divs = entry.find_all('div', attrs={'style': 'width: 19%; padding: 0 10px 0 0; float: left;'})
speakers = [
    line
    for div in speaker_divs
    for line in div.text.splitlines()
    if line not in ['Speaker', '']
]
# The last speaker line is discarded.
# NOTE(review): presumably a trailing non-sermon line on the live page -- confirm.
speakers = speakers[:-1]

# Title column: one href per <a>; an anchor without an href contributes ''.
title_divs = entry.find_all('div', attrs={'style': 'width: 66%; padding: 0 10px 0 0; float: right;'})
links_temp = [
    anchor.get('href', '')
    for div in title_divs
    for anchor in div.find_all('a')
]
# Three empty slots are inserted just before the final link.
# NOTE(review): presumably three titles near the end of the page have no audio
# link; this is tied to the page's current content -- re-check if it changes.
links = links_temp[:-1] + ['', '', ''] + links_temp[-1:]

# Titles: keep lines longer than 10 characters.  A genuine title of 10
# characters or fewer would be dropped and shift every later row.
titles = [line for div in title_divs for line in div.text.splitlines() if len(line) > 10]

In [10]:
# Assemble the Luke rows with the same set of columns the other sermon frames
# carry, then order them by the sortable date string.
luk = (
    pd.DataFrame(
        list(zip(dates, speakers, titles, links)),
        columns=['date', 'speaker', 'title', 'audioLink'],
    )
    .assign(
        testament='NT',
        book='Luke',
        date_str=lambda frame: frame['date'].apply(lambda x: x[-4:] + "-" + x[:2] + "-" + x[3:5]),
        year=lambda frame: frame['date'].apply(lambda x: x[-4:]),
        category='sermon',
        handoutLink='',
    )
    .sort_values(by='date_str')
    .reset_index(drop=True)
)

In [11]:
# Stack the per-book sermon rows and the separately scraped Luke rows.
sermon_df = pd.concat(objs=[df1, luk], axis=0)

In [12]:
# Re-sort the combined sermons by date string and renumber the rows from 0.
sermon_df = (
    sermon_df
    .sort_values(by='date_str')
    .reset_index(drop=True)
)

### Sunday School

In [13]:
# Sunday-school pages to scrape with get_ss(): one
# (format, testament, book, page path) tuple per study.  'old' pages are parsed
# by get_ss() itself; any other format is handed on to get_media().
ss_links = [
    {'format': page_format, 'testament': testament, 'book': book, 'link': path}
    for page_format, testament, book, path in [
        # Old Testament
        ('old', 'OT', 'Genesis', '2013/07/19/genesis/'),
        ('old', 'OT', 'Exodus', '2013/07/19/exodus/'),
        ('old', 'OT', 'Leviticus and Numbers', '2013/07/19/leviticusnumbers/'),
        ('old', 'OT', 'Deuteronomy', '2013/07/19/deuteronomy/'),
        ('old', 'OT', 'Joshua', '2013/07/19/joshua/'),
        ('old', 'OT', 'Judges', '2013/07/19/judges/'),
        ('old', 'OT', 'Ruth', '2013/07/19/ruth/'),
        ('old', 'OT', '1 Samuel', '2013/07/19/1-samuel/'),
        ('old', 'OT', '2 Samuel', '2013/07/19/2-samuel/'),
        ('new', 'OT', '1 Kings', '2013/07/19/1-kings/'),
        ('new', 'OT', '2 Kings', '2013/07/19/2-kings/'),
        ('old', 'OT', 'Ezra and Nehemiah and Esther', '2013/07/19/lessons-in-ezranehemiahesther/'),
        ('new', 'OT', 'Isaiah', '2014/09/16/isaiah/'),
        ('new', 'OT', 'Jeremiah', '2016/05/06/studies-in-jeremiah/'),
        ('old', 'OT', 'Daniel', '2013/07/19/daniel/'),
        ('new', 'OT', 'Minor Prophets', '2013/07/19/minor-prophets/'),
        # New Testament
        ('new', 'NT', 'Matthew', '2014/01/06/the-sermon-on-the-mount/'),
        ('new', 'NT', 'Incarnation Studies', '2013/07/19/studies-on-the-incarnation/'),
        ('new', 'NT', 'The One Anothers', '2013/07/19/one-anothers-in-the-new-testament/'),
        ('new', 'NT', 'Peter the Apostle', '2014/08/19/peter-the-apostle/'),
        ('new', 'NT', 'Acts', '2013/07/19/acts/'),
        ('new', 'NT', 'Philippians', '2014/06/02/philippians/'),
        ('new', 'NT', 'Hebrews', '2013/07/19/hebrews/'),
        ('new', 'NT', '2 Timothy', '2017/06/13/2-timothy/'),
        ('new', 'NT', 'Single Chapter Books', '2013/07/19/philemon-23-john-jude/'),
        # Topical studies
        ('new', '', 'Gods Glory in Gods Word', '2019/11/16/gods-glory-in-gods-word/'),
        ('new', '', 'How to Study the Bible', '2019/05/22/how-to-study-your-bible/'),
        # NOTE(review): this page path is "biblical-christian-worldview" yet it is
        # filed under OT / Genesis -- confirm the book label is intended.
        ('new', 'OT', 'Genesis', '2017/08/03/biblical-christian-worldview/'),
        ('new', '', 'Doctrinal Distinctives: Baptist and Reformed', '2018/06/17/doctrinal-distinctives-baptist-reformed-2/'),
        ('new', '', 'Getting the Most out of the Word Taught', '2017/03/24/how-to-listen-to-a-sermon/'),
    ]
]


In [14]:
# Scrape every page in ss_links and stack the results under one fresh
# 0..n-1 index (takes about 30 seconds).
sunday_school = pd.concat(
    [get_ss(page) for page in ss_links],
    ignore_index=True,
)

In [15]:
# Order the lessons by their sortable date string and tag the category.
sunday_school = sunday_school.sort_values(by='date_str').assign(category='sunday school')


### Other Sermons (Topical)

In [16]:
# Topical sermon series to scrape with get_ss(): one (series title, page path)
# pair each.  Every entry uses the 'new' page format and has no testament.
topical_links = [
    {'format': 'new', 'testament': '', 'book': series, 'link': path}
    for series, path in [
        ('The Doctrines of Grace', '2013/07/19/the-doctrines-of-grace/'),
        ('The Seven Sayings of Christ on the Cross', '2015/01/20/the-seven-sayings-of-christ-on-the-cross/'),
        ('Sermons on the Kingdom of God', '2014/06/02/sermons-on-the-kingdom-of-god/'),
        ('Sermons on Biblical Manhood and Womanhood', '2014/05/20/sermons-on-biblical-manhood-womanhood/'),
        ('Church and Doctrinal Issues', '2013/06/24/church-and-doctrinal-issues/'),
        ('Sermons on the Glory of God', '2013/06/24/gods-passion-for-his-own-glory/'),
        ('Sermons on Atonement', '2013/10/02/sermons-on-atonement/'),
        ('Spiritual Disciplines of the Christian Life', '2013/06/24/disciplines-for-the-christian-life/'),
        ('Stewardship', '2013/07/19/sermons-on-stewardship/'),
        ("The Apostles' Creed", '2013/07/19/the-apostles-creed/'),
        ("The Doctrine of Adoption", '2013/07/19/sermons-on-adoption-in-christ/'),
        ("Sermons on the Church", '2013/07/19/sermons-on-the-church/'),
        ("Sermons on the Sermon on The Mount", '2013/07/19/sermons-on-the-sermon-on-the-mount-2/'),
    ]
]
## TODO: not scraped yet -- this page uses a different format:
## 2010 summer sermon series

In [17]:
# Scrape every page in topical_links and stack the results under one fresh
# 0..n-1 index (takes about 12 seconds).
topical = pd.concat(
    [get_ss(page) for page in topical_links],
    ignore_index=True,
)

In [18]:
# Order the topical sermons by their sortable date string and tag the category.
topical = topical.sort_values(by='date_str').assign(category='sermon')

### Combine Into One File

In [19]:
# One table for everything: book sermons (incl. Luke), Sunday school, topical.
df = pd.concat(
    [sermon_df, sunday_school, topical],
    ignore_index=True,
)

In [20]:
def fix_link(x):
    """Rewrite an audio URL onto the ``https://www.gracecovenantbaptist.org`` root.

    Everything after the first ``'.org'`` in ``x`` is kept as the path and
    re-attached to the canonical site root.

    Parameters
    ----------
    x : object
        The scraped link, normally a string.

    Returns
    -------
    str
        The normalised URL, or ``''`` when ``x`` is not a string or contains
        no ``'.org'`` (e.g. an empty or relative link).
    """
    if not isinstance(x, str):
        return ''
    # partition() splits on the FIRST '.org' only, so a path that itself
    # contains '.org' is preserved in full instead of being cut short.
    _, marker, path = x.partition('.org')
    if not marker:
        return ''
    return 'https://www.gracecovenantbaptist.org' + path
# Normalise every scraped audio link onto the canonical site root.
df['audioLink'] = df['audioLink'].apply(fix_link)

In [21]:
# Save the combined table as a pickle file.
df.to_pickle(path="../data/teaching.pkl")

### Get mp3 Files

In [22]:
# A full run took about an hour (1320 links).  Files already on disk are
# skipped, so an interrupted run can simply be started again.
MP3_DIR = Path('../mp3s')
MP3_DIR.mkdir(parents=True, exist_ok=True)

audioLinks = [x for x in list(df['audioLink']) if x != '']
failed_downloads = []
for audioLink in audioLinks:
    # Flatten the URL path into one file name: a/b/c.mp3 -> a___b___c.mp3
    fileName = audioLink.split('.org/')[-1].replace('/', '___')
    target = MP3_DIR / fileName
    if target.exists():
        continue
    try:
        mp3 = requests.get(audioLink, timeout=60)
        # Without this check a 404/500 error page would be saved as an .mp3.
        mp3.raise_for_status()
    except requests.RequestException as err:
        # One bad link should not abort the whole run; record it and move on.
        failed_downloads.append((audioLink, str(err)))
        continue
    with open(target, 'wb') as f:
        f.write(mp3.content)

# Displayed as the cell output: (url, error) for every link that failed.
failed_downloads