In [1]:
from bs4 import BeautifulSoup
from requests import get
import pandas as pd
from tqdm import tqdm
from dateutil import parser
import string
import re

In [35]:
# Search term, already URL-encoded ("%20" stands for a space).
SearchString = "ICICI%20BANK"

# Search-results URL without the page number; the number is appended per request.
url = "https://www.thehindu.com/search/?q=" + SearchString + "&order=DESC&sort=publishdate&pd=year&ct=text&s=business&page="

# Fetch the first results page to find out how many result pages exist.
# The timeout keeps a stalled connection from hanging the notebook, and
# raise_for_status() makes an HTTP error fail loudly here instead of being
# parsed as if it were an (empty) results page.
response = get(url + "1", timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'lxml')

# Page numbers advertised by the pagination links; links without a numeric
# 'data-page-no' attribute are ignored rather than crashing int().
page_numbers = [
    int(link.get('data-page-no'))
    for link in soup.select('a.page-link')
    if (link.get('data-page-no') or '').isdecimal()
]

# NOTE(review): presumably a result set that fits on a single page renders no
# pagination links at all -- fall back to one page in that case; confirm.
result_max = max(page_numbers, default=1)

# Total number of result pages
print(result_max)

37


In [36]:
# One search-results URL per page number, from 1 up to and including result_max.
url_all = [f"{url}{page_no}" for page_no in range(1, result_max + 1)]

# Accumulators filled by the scraping cells; each one becomes a DataFrame column.
headlines = []
sections = []
dates = []
news = []
urls = []
authors = []

In [37]:
print("[INFO] Extracting links...")

# Start from empty columns so that re-running this cell does not duplicate rows.
for column in (headlines, urls, sections, dates):
    column.clear()

# Walk every search-results page and collect, per story card, the headline,
# the article URL, the section name and the published date (YYYY-MM-DD).
# `authors` is not filled here; bylines are read from the article pages.
for src in tqdm(url_all):

    try:
        # The timeout keeps one stalled request from hanging the whole crawl.
        soup = BeautifulSoup(get(src, timeout=30).text, 'lxml')

        # Headline text and article link both come from the same anchor element.
        cards = soup.select('a.story-card75x1-text')
        page_headlines = [card.text.strip() for card in cards]
        page_urls = [card.get('href') for card in cards]

        # Sections (markets, industry, business etc.)
        page_sections = [tag.text.strip() for tag in soup.select('a.section-name')]

        # Published dates, normalised to the date part of the parsed timestamp
        page_dates = [str(parser.parse(tag.text)).split()[0] for tag in soup.select('span.dateline')]

        # The columns are later zipped into one DataFrame, so a page that
        # yields a different number of values per column would misalign
        # every row that follows it.
        if not (len(page_headlines) == len(page_sections) == len(page_dates)):
            raise ValueError(
                "column length mismatch: %d headlines, %d sections, %d dates"
                % (len(page_headlines), len(page_sections), len(page_dates))
            )

    except Exception as exc:
        # `Exception` (not a bare except) so that Ctrl-C still interrupts the cell.
        # Stop at the first page that cannot be extracted; nothing from that
        # page has been stored, so the columns collected so far stay aligned.
        print("Exception occurred in url : ", src, "-", repr(exc))
        break

    # Commit the page only after every column was extracted successfully.
    headlines.extend(page_headlines)
    urls.extend(page_urls)
    sections.extend(page_sections)
    dates.extend(page_dates)

print("[INFO] Links Extracted.")

  0%|                                                                                           | 0/37 [00:00<?, ?it/s]

[INFO] Extracting links...


100%|██████████████████████████████████████████████████████████████████████████████████| 37/37 [01:00<00:00,  1.65s/it]

[INFO] Links Extracted.





In [38]:
# Number of article links collected from the search-result pages.
print("Total no. of pages scraped = ", len(urls))

# Dates are stored as 'YYYY-MM-DD' strings, so the smallest string is the
# oldest date. `default` avoids a ValueError when nothing was scraped.
print("Oldest available article: ", min(dates, default="n/a"))

Total no. of pages scraped =  443
Oldest available article:  2019-10-22


In [39]:
print("[INFO] Extracting articles...")

# Start from empty columns so that re-running this cell does not duplicate rows.
news.clear()
authors.clear()

# Visit every collected article URL and record its body text and byline.
# Exactly one entry is appended to `news` and one to `authors` per URL, in
# every code path, so both lists stay aligned with `urls`.
for src in tqdm(urls):
    news_article = None
    byline = None

    try:
        # Parse the url to NewsPage; the timeout keeps one stalled request
        # from hanging the whole crawl.
        soup = BeautifulSoup(get(src, timeout=30).text, 'lxml')
    except Exception as exc:
        # `Exception` (not a bare except) so that Ctrl-C still interrupts the cell.
        # The page could not be fetched: both fields stay None for this URL.
        print("Exception occurred in url : ", src, "-", repr(exc))
    else:
        # Extracts the news article. The element id is built from a fixed
        # prefix plus the first run of digits in the last URL path segment.
        # NOTE(review): the prefix is hard-coded and 117 of 443 articles came
        # back empty in the recorded run -- confirm whether other prefixes exist.
        try:
            article_id = re.findall(r"\d+", src.split('/')[-1])[0]
            body = soup.find(id='content-body-14269002-' + article_id)
            news_article = body.text.strip()
        except (IndexError, AttributeError):
            # IndexError: no digits in the URL; AttributeError: element not found.
            news_article = None

        # Extracts the bylines; several authors are joined with ' | ', and a
        # page without any author link yields an empty string.
        byline = ' | '.join(tag.text.strip() for tag in soup.select('a.auth-nm'))

    news.append(news_article)
    authors.append(byline)

print("[INFO] Articles Extracted.")

  0%|                                                                                          | 0/443 [00:00<?, ?it/s]

[INFO] Extracting articles...


100%|████████████████████████████████████████████████████████████████████████████████| 443/443 [05:19<00:00,  1.39it/s]

[INFO] Articles Extracted.





In [40]:
# Assemble the scraped columns into one table; every list must have the same length.
df = pd.DataFrame(dict(
    Headlines=headlines,
    Sections=sections,
    Articles=news,
    Published_Dates=dates,
    Source_URLs=urls,
    ByLines=authors,
))

# Count the missing values per column, then preview the first rows.
missing_per_column = df.isnull().sum()
print(missing_per_column)
df.head()

Headlines            0
Sections             0
Articles           117
Published_Dates      0
Source_URLs          0
ByLines              0
dtype: int64


Unnamed: 0,Headlines,Sections,Articles,Published_Dates,Source_URLs,ByLines
0,Sensex ends 113 points higher; HCL Tech spurts 4%,Markets,Equity benchmarks Sensex and Nifty notched up ...,2020-10-20,https://www.thehindu.com/business/markets/sens...,PTI
1,Sensex rises over 150 pts in early trade; Nift...,Markets,Equity benchmark Sensex advanced over 150 poin...,2020-10-20,https://www.thehindu.com/business/markets/sens...,PTI
2,"Today's top business news: Stocks up, retail i...",Business,,2020-10-20,https://www.thehindu.com/business/businesslive...,The Hindu Net Desk
3,"Sensex rallies 449 pts; Nifty tops 11,850",Markets,Equity benchmark Sensex rallied 449 points on ...,2020-10-19,https://www.thehindu.com/business/markets/sens...,PTI
4,Sensex rallies over 300 points in early trade;...,Markets,Equity benchmark Sensex rallied over 300 point...,2020-10-19,https://www.thehindu.com/business/markets/sens...,PTI


In [46]:
# Keep only the rows in which every column has a value
# (dropna defaults: axis=0, how='any').
df = df.dropna()
print("Total Data Scraped: ", df.shape)

Total Data Scraped:  (326, 6)


In [49]:
# File-name-friendly form of the search term: URL-encoded spaces become underscores.
sname = SearchString.replace('%20', '_')

# Persist the cleaned table as a pickle.
# (CSV alternative: df.to_csv("news_thehindu_" + sname + ".csv"))
output_path = "news_thehindu_" + sname + ".pkl"
df.to_pickle(output_path)