In [1]:
from newsplease import NewsPlease # An integrated web crawler and information extractor for news 
from bs4 import BeautifulSoup # Python library for pulling data out of HTML and XML files
from requests import get # standard for making HTTP requests in Python
import pandas as pd # library written for data manipulation and analysis
import sys #  System-specific parameters and functions

In [2]:
# First page of The Hindu search results for "Alcoholic Anonymous"
# (query string asks for text articles in "todays-paper", sorted by publish date, descending).
url = 'https://www.thehindu.com/search/?q=Alcoholic%20Anonymous&order=DESC&sort=publishdate&ct=text&s=todays-paper'
response = get(url)
soup = BeautifulSoup(response.text, 'lxml')

# Total number of articles: take the text after "of" in the section-controls
# span and keep only its digits (drops separators and surrounding words).
controls_text = soup.select_one('.section-controls').span.text
article_count = int(''.join(ch for ch in controls_text.split('of')[1] if ch.isdigit()))

# Results shown per search page -- assumed to be 12, as in the original page
# arithmetic; TODO confirm against the site if the layout changes.
results_per_page = 12

# Exclusive upper bound for `range(1, max_pages)`: ceil(article_count / results_per_page) + 1.
# Ceiling division avoids requesting one extra, empty page when the article
# count is an exact multiple of the page size.
max_pages = -(-article_count // results_per_page) + 1

In [3]:
%%time
# Parallel lists: entry k of each list is expected to describe the same search result.
# NOTE(review): each field is collected with its own selector, so alignment relies on
# every result card exposing all fields -- verify with the length check further down.
headlines, urls, dates, authors, sections = [], [], [], [], []

# Search URL without the page number; the 1-based page index is appended per request.
search_url = 'https://www.thehindu.com/search/?q=Alcoholic%20Anonymous&order=DESC&sort=publishdate&ct=text&s=todays-paper&page='

for index, page in enumerate(range(1, max_pages)):
    response = get(search_url + str(page))
    soup = BeautifulSoup(response.text, 'lxml')

    # Extracts url's
    urls.extend(link['href'] for link in soup.select('a.story-card75x1-text'))

    # Extracts headline's (iterate over the headline elements themselves rather than
    # borrowing the count of '.section-name' elements, which could differ)
    headlines.extend(tag.text.strip() for tag in soup.select('.story-card75x1-text'))

    # Extracts author's; a card without an author element contributes None
    for card in soup.select('.story-card-news'):
        author_tag = card.select_one('.story-card-33-author-name')
        authors.append(author_tag.text.strip() if author_tag is not None else None)

    # Extracts Date's
    dates.extend(tag.text.strip() for tag in soup.select('.dateline'))

    # Extracts section's
    sections.extend(tag.text.strip() for tag in soup.select('.section-name'))

    # Single-line progress indicator, overwritten in place via carriage return
    sys.stdout.write('\r' + str(index) + ' : ' + str(max_pages))
    sys.stdout.flush()

884 : 885Wall time: 26min 21s


In [4]:
%%time
# Parallel lists, one entry per element of `urls`:
#   news     -- full article text, or None when it could not be fetched/extracted
#   meetings -- text following the last occurrence of 'Alcoholics', or None when
#               the article has no text or does not contain that marker
meetings, news = [], []

for index, url in enumerate(urls):
    try:
        # Parse the url to NewsPlease
        article = NewsPlease.from_url(url, timeout=6)
    except Exception:
        # Best-effort scrape: a failed download/parse yields None for this url.
        # `Exception` (not a bare except) keeps the kernel interruptible during
        # this very long loop, since KeyboardInterrupt is not swallowed.
        article = None

    # Extracts the article body (None when the article or its text is missing)
    text = getattr(article, 'text', None)
    news.append(text)

    # Extracts the AA meetings if available
    if isinstance(text, str) and 'Alcoholics' in text:
        meetings.append(text.split('Alcoholics')[-1])
    else:
        meetings.append(None)

    # Single-line progress indicator, overwritten in place via carriage return
    sys.stdout.write('\r' + str(index) + ' : ' + str(url) + '\r')
    sys.stdout.flush()

Wall time: 7h 35min 54shindu.com/todays-paper/tp-miscellaneous/in-madurai-today/article3233838.ece3235280.ece46450.ece9.ecece212951.ecele3048267.ece7569.eceecece74.ece


In [6]:
# Sanity check: all seven scraped columns should report the same length
print(*(len(column) for column in (headlines, meetings, authors, dates, sections, news, urls)))

10616 10616 10616 10616 10616 10616 10616


In [7]:
# Assemble the scraped columns into one table (one row per search result)
tbl = pd.DataFrame(dict(
    Headline=headlines,
    Author=authors,
    Published_Region=sections,
    Published_Date=dates,
    Meetings=meetings,
    Main_Article=news,
    Source_urls=urls,
))
tbl.head()

Unnamed: 0,Headline,Author,Published_Region,Published_Date,Meetings,Main_Article,Source_urls
0,Madurai Today,Special Correspondent,TAMIL NADU,"April 14, 2019",Anonymous: Meeting; U. C. Higher Secondary Sc...,RELIGION\nSelva Vinayagar Temple: Panchangam r...,https://www.thehindu.com/todays-paper/tp-natio...
1,Madurai Today,Special Correspondent,TAMIL NADU,"April 13, 2019","Anonymous: Meeting; Ahana Hospital, Anna Bus ...",RELIGION\nMeenakshi Sundareswarar Temple: Chit...,https://www.thehindu.com/todays-paper/tp-natio...
2,Engagements,,TAMIL NADU,"April 13, 2019",,,https://www.thehindu.com/todays-paper/tp-natio...
3,Madurai Today,,TAMIL NADU,"April 12, 2019","Anonymous: Meeting; Tamilarasi School, Melur,...",RELIGION\nMeenakshi Sundareswarar Temple: Chit...,https://www.thehindu.com/todays-paper/tp-natio...
4,engagements,,TAMIL NADU,"April 12, 2019",,,https://www.thehindu.com/todays-paper/tp-natio...


In [8]:
# Persist the CSV first: it needs no optional Excel engine, so the scraped data
# is saved even if the Excel export below fails.
tbl.to_csv('The_Hindu_AA_Meetings.csv', index=False)

# Write the modern .xlsx format; legacy .xls writing depends on the xlwt engine,
# which pandas removed in 2.0.
tbl.to_excel('The_Hindu_AA_Meetings.xlsx', index=False)