In [1]:
from bs4 import BeautifulSoup
from requests import get
import pandas as pd
from tqdm import tqdm
from dateutil import parser
import string
import re
import time

In [62]:
# Build the Nifty50 lookup table from MoneyControl's index-contribution page.
# For now only the fields needed for news scraping are collected; prices,
# changes etc. will be scraped at a later stage.

getnifty50 = "https://www.moneycontrol.com/stocks/marketstats/indcontrib.php?optex=NSE&opttopic=indcontrib&index=9"
soup = BeautifulSoup(get(getnifty50).text, 'lxml')

# Select the "a.bl_12" anchors once: even positions are read as company links,
# odd positions as industry links.
bl12_links = soup.select("a.bl_12")
company_links = bl12_links[0::2]
industry_links = bl12_links[1::2]

Company_Name = [link.text.strip() for link in company_links]
Industry = [link.text.strip() for link in industry_links]

# The last two path segments of each company link are kept as url fragments.
href_parts = [link.get('href').split('/') for link in company_links]
urlsplit1 = [parts[-2] for parts in href_parts]
urlsplit2 = [parts[-1] for parts in href_parts]

nifty50_lookuptable = pd.DataFrame(dict(
    Company_Name=Company_Name,
    Industry=Industry,
    urlsplit1=urlsplit1,
    urlsplit2=urlsplit2,
))

print(nifty50_lookuptable.shape)
nifty50_lookuptable.head()

(50, 4)


Unnamed: 0,Company_Name,Industry,urlsplit1,urlsplit2
0,HDFC Bank,Bank - Private,hdfcbank,HDF01
1,HDFC,Finance - Housing,hdfc,HDF
2,ICICI Bank,Bank - Private,icicibank,ICI02
3,Infosys,IT Services & Consulting,infosys,IT
4,Kotak Mahindra,Bank - Private,kotakmahindra,KMB


In [134]:
def _count_result_pages(soup):
    '''
    Return the page count derived from a news-listing page's pagination block.

    The count is the number of anchors inside the first `div.pages.MR10.MT15`
    element plus one (presumably the current page is not rendered as a link -
    TODO confirm). When the pagination block is absent, one page is assumed
    instead of failing with an IndexError.
    '''
    pagers = soup.select('div.pages.MR10.MT15')
    if not pagers:
        return 1
    return len(pagers[0].select('a')) + 1


def initialize(urlsplit2):
    '''
    Function to obtain total number of result pages, initialize blank news data and
    set urls for moneycontrol news search page for 2020 and 2019.

    Parameters
    ----------
    urlsplit2 : str
        Value placed in the `sc_id` query parameter of the yearly listing urls
        (presumably an entry of the lookup table's `urlsplit2` column - verify
        against the caller).

    Side effects
    ------------
    Sets the module-level names `ticker` and `url_all`, and resets
    `headlines`, `dates`, `news`, `urls`, `sources`, `sections` and `authors`
    to empty lists. Prints the per-year page counts as `[pages_2019, pages_2020]`.
    '''
    global ticker, url_all, headlines, dates, news, urls, sources, sections, authors

    url = "https://www.moneycontrol.com/stocks/company_info/stock_news.php?sc_id={}&scat=&pageno={}&next=0&durationType=Y&Year={}&duration=1&news_type="
    url19 = "https://www.moneycontrol.com/stocks/company_info/stock_news.php?sc_id=" + urlsplit2 + "&durationType=Y&Year=2019"
    url20 = "https://www.moneycontrol.com/stocks/company_info/stock_news.php?sc_id=" + urlsplit2 + "&durationType=Y&Year=2020"

    soup = BeautifulSoup(get(url19).text, 'lxml')
    result_max = [_count_result_pages(soup)]

    soup = BeautifulSoup(get(url20).text, 'lxml')
    # The ticker is the value after ':' in the second '|'-separated field of the header text.
    ticker = soup.select('div.FL.gry10')[0].text.split('|')[1].split(':')[1].strip()
    result_max += [_count_result_pages(soup)]

    # NOTE(review): the paginated urls put `ticker` (not `urlsplit2`) into sc_id - confirm
    # the site accepts it.
    # Each year is paginated with its own page count (2019 -> result_max[0], 2020 -> result_max[1]).
    url_all = ([url.format(ticker, i, 2019) for i in range(1, result_max[0] + 1)]
               + [url.format(ticker, i, 2020) for i in range(1, result_max[1] + 1)])
    headlines, dates, news, urls, sources = [], [], [], [], []
    # These accumulators are filled by getnewslinks()/getarticles() and read by chkdata().
    sections, authors = [], []
    print("Total number of result pages for", ticker, ":", result_max)

In [None]:
# TODO: the cells below have not yet been updated for MoneyControl; the cells above are already updated.

In [4]:
def _parse_publish_date(text):
    '''
    Return `text` parsed into a `YYYY-MM-DD` string, or None when it cannot be parsed.
    '''
    try:
        return str(parser.parse(text)).split()[0]
    except (ValueError, OverflowError):
        # Keep a placeholder so the date list stays aligned with the other columns.
        return None


def getnewslinks():
    '''
    Function to scrape news headlines, urls, publish dates etc.

    Iterates over the module-level `url_all` listing pages and appends to the
    module-level lists `headlines`, `urls`, `sections` and `dates`. A page is
    added to all four lists only after every field has been extracted, so a
    failure never leaves them partially updated. The first page that fails is
    reported and stops the loop. Dates that cannot be parsed are stored as None.
    Bylines are not collected here; getarticles() gathers them per article.
    '''
    print("[INFO] Extracting Links...")

    for src in tqdm(url_all):
        try:
            soup = BeautifulSoup(get(src).text, 'lxml')

            # Headline text and article url come from the same anchors.
            story_links = soup.select('a.story-card75x1-text')
            page_headlines = [link.text.strip() for link in story_links]
            page_urls = [link.get('href') for link in story_links]

            # Sections (markets, industry, business etc.)
            page_sections = [tag.text.strip() for tag in soup.select('a.section-name')]

            # Published dates
            page_dates = [_parse_publish_date(tag.text) for tag in soup.select('span.dateline')]
        except Exception as exc:
            print("Exception occurred in url : ", src, "-", repr(exc))
            break

        headlines.extend(page_headlines)
        urls.extend(page_urls)
        sections.extend(page_sections)
        dates.extend(page_dates)

    print("[INFO] Links Extracted.")
    print("Total No. of Pages to be Scraped = ", len(urls))
    # min() fails on an empty list and cannot compare None with str.
    known_dates = [d for d in dates if d is not None]
    print("Oldest Available Article: ", min(known_dates) if known_dates else None)

In [5]:
def _extract_article_text(soup, src):
    '''
    Return the stripped article body for `src`, or None when it cannot be located.

    The body element id is built from a fixed prefix plus the first run of
    digits in the last path segment of `src`.
    '''
    try:
        article_id = re.findall(r"\d+", src.split('/')[-1])[0]
        # NOTE(review): the id prefix is hard-coded; confirm it matches the pages being scraped.
        return soup.find(id='content-body-14269002-' + article_id).text.strip()
    except (IndexError, AttributeError):
        # IndexError: no digits in the url; AttributeError: no element with that id.
        return None


def getarticles():
    '''
    Function to scrape news articles and bylines.

    Visits every url in the module-level `urls` list and appends exactly one
    entry per url to both `news` and `authors`, so the two lists always stay
    aligned with `urls`. An article that cannot be found is stored as None;
    bylines are joined with ' | ' (an empty string when none are found). If
    the page cannot be fetched or parsed, the url is reported and None is
    stored for whatever could not be extracted.
    '''
    print("[INFO] Extracting Articles...")

    for src in tqdm(urls):
        article, byline = None, None
        try:
            # Parse the url to NewsPage
            soup = BeautifulSoup(get(src).text, 'lxml')
            article = _extract_article_text(soup, src)
            byline = ' | '.join(tag.text.strip() for tag in soup.select('a.auth-nm'))
        except Exception as exc:
            print("Exception occurred in url : ", src, "-", repr(exc))

        news.append(article)
        authors.append(byline)

    print("[INFO] Articles Extracted.")

In [6]:
def chkdata():
    '''
    Assemble the scraped lists into the module-level DataFrame `df`, report
    how many values are missing per column and keep only the complete rows.
    '''
    global df
    scraped_columns = {
        'Headlines': headlines,
        'Sections': sections,
        'Articles': news,
        'Published_Dates': dates,
        'Source_URLs': urls,
        'ByLines': authors,
    }
    df = pd.DataFrame(scraped_columns)

    print("Missing Info in Scraped Data :")
    print(df.isna().sum())

    # Discard every row that has at least one missing value.
    df = df.dropna(axis='index')
    print("Total Usable Scraped Data : ", df.shape)

In [7]:
def savefile(SearchString):
    '''
    Write the module-level DataFrame `df` to a pickle file whose name is
    derived from the search string ('%20' is turned into '_').
    '''
    sname = SearchString.replace('%20', '_')
    target_file = "news_thehindu_" + sname + ".pkl"
    df.to_pickle(target_file)
    print("Data saved for", SearchString, ".")

In [None]:
# Run the whole scrape pipeline once per entry of SearchString,
# pausing 5 seconds between entries.

for count, sstring in enumerate(SearchString, start=1):
    print("Nifty50 Extraction Search Count :", count)
    initialize(sstring)
    getnewslinks()
    getarticles()
    chkdata()
    savefile(sstring)
    time.sleep(5)

Nifty50 Extraction Search Count : 1


  0%|                                                                                            | 0/6 [00:00<?, ?it/s]

Total number of result pages for reliance%20petroleum : 6
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:09<00:00,  1.56s/it]
  2%|█▎                                                                                 | 1/63 [00:00<00:12,  4.78it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  63
Oldest Available Article:  2019-10-23
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:14<00:00,  4.35it/s]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Sections            0
Articles           52
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (11, 6)
Data saved for reliance%20petroleum .
Nifty50 Extraction Search Count : 2


  0%|                                                                                           | 0/49 [00:00<?, ?it/s]

Total number of result pages for hdfc%20bank : 49
[INFO] Extracting Links...


100%|██████████████████████████████████████████████████████████████████████████████████| 49/49 [01:13<00:00,  1.50s/it]
  0%|                                                                                          | 0/586 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  586
Oldest Available Article:  2019-10-22
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 586/586 [07:00<00:00,  1.39it/s]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines            0
Sections             0
Articles           137
Published_Dates      0
Source_URLs          0
ByLines              0
dtype: int64
Total Usable Scraped Data :  (449, 6)
Data saved for hdfc%20bank .
Nifty50 Extraction Search Count : 3


  0%|                                                                                           | 0/32 [00:00<?, ?it/s]

Total number of result pages for infosys : 32
[INFO] Extracting Links...


100%|██████████████████████████████████████████████████████████████████████████████████| 32/32 [00:47<00:00,  1.47s/it]
  0%|▏                                                                                 | 1/383 [00:00<01:15,  5.03it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  383
Oldest Available Article:  2019-10-21
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 383/383 [02:47<00:00,  2.28it/s]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Sections            0
Articles           83
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (300, 6)
Data saved for infosys .
Nifty50 Extraction Search Count : 4


  0%|                                                                                           | 0/61 [00:00<?, ?it/s]

Total number of result pages for hdfc : 61
[INFO] Extracting Links...


100%|██████████████████████████████████████████████████████████████████████████████████| 61/61 [01:29<00:00,  1.46s/it]
  0%|                                                                                  | 1/725 [00:00<02:24,  5.01it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  725
Oldest Available Article:  2019-10-21
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 725/725 [03:59<00:00,  3.03it/s]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines            0
Sections             0
Articles           137
Published_Dates      0
Source_URLs          0
ByLines              0
dtype: int64
Total Usable Scraped Data :  (588, 6)
Data saved for hdfc .
Nifty50 Extraction Search Count : 5


  0%|                                                                                           | 0/28 [00:00<?, ?it/s]

Total number of result pages for tcs : 28
[INFO] Extracting Links...


100%|██████████████████████████████████████████████████████████████████████████████████| 28/28 [00:39<00:00,  1.39s/it]
  0%|                                                                                          | 0/330 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  330
Oldest Available Article:  2019-10-22
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 330/330 [02:19<00:00,  2.36it/s]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Sections            0
Articles           72
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (258, 6)
Data saved for tcs .
Nifty50 Extraction Search Count : 6


  0%|                                                                                           | 0/37 [00:00<?, ?it/s]

Total number of result pages for ICICI%20BANK : 37
[INFO] Extracting Links...


100%|██████████████████████████████████████████████████████████████████████████████████| 37/37 [00:53<00:00,  1.44s/it]
  0%|                                                                                          | 0/443 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  443
Oldest Available Article:  2019-10-22
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 443/443 [03:28<00:00,  2.12it/s]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines            0
Sections             0
Articles           117
Published_Dates      0
Source_URLs          0
ByLines              0
dtype: int64
Total Usable Scraped Data :  (326, 6)
Data saved for ICICI%20BANK .
Nifty50 Extraction Search Count : 7


  0%|                                                                                           | 0/28 [00:00<?, ?it/s]

Total number of result pages for kotak%20bank : 28
[INFO] Extracting Links...


100%|██████████████████████████████████████████████████████████████████████████████████| 28/28 [00:40<00:00,  1.44s/it]
  0%|                                                                                          | 0/326 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  326
Oldest Available Article:  2019-10-22
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 326/326 [02:05<00:00,  2.60it/s]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Sections            0
Articles           91
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (235, 6)
Data saved for kotak%20bank .
Nifty50 Extraction Search Count : 8


  0%|                                                                                           | 0/17 [00:00<?, ?it/s]

Total number of result pages for hul : 17
[INFO] Extracting Links...


100%|██████████████████████████████████████████████████████████████████████████████████| 17/17 [00:23<00:00,  1.41s/it]
  0%|▍                                                                                 | 1/200 [00:00<00:39,  5.07it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  200
Oldest Available Article:  2019-10-22
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [01:05<00:00,  3.07it/s]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Sections            0
Articles           49
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (151, 6)
Data saved for hul .
Nifty50 Extraction Search Count : 9


  0%|                                                                                           | 0/20 [00:00<?, ?it/s]

Total number of result pages for itc : 20
[INFO] Extracting Links...


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:29<00:00,  1.48s/it]
  0%|                                                                                          | 0/239 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  239
Oldest Available Article:  2019-10-22
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 239/239 [01:18<00:00,  3.06it/s]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Sections            0
Articles           55
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (184, 6)
Data saved for itc .
Nifty50 Extraction Search Count : 10


  0%|                                                                                            | 0/3 [00:00<?, ?it/s]

Total number of result pages for larsen%20toubro : 3
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:03<00:00,  1.21s/it]
  0%|                                                                                           | 0/36 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  36
Oldest Available Article:  2019-10-23
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 36/36 [00:27<00:00,  1.32it/s]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Sections           0
Articles           6
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (30, 6)
Data saved for larsen%20toubro .
Nifty50 Extraction Search Count : 11


  0%|                                                                                           | 0/35 [00:00<?, ?it/s]

Total number of result pages for axis%20bank : 35
[INFO] Extracting Links...


100%|██████████████████████████████████████████████████████████████████████████████████| 35/35 [00:50<00:00,  1.45s/it]
  0%|                                                                                          | 0/410 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  410
Oldest Available Article:  2019-10-22
[INFO] Extracting Articles...


 86%|████████████████████████████████████████████████████████████████████▉           | 353/410 [01:42<00:15,  3.71it/s]

In [9]:
# Same pipeline as the full run, but entries before index 10 are skipped
# (presumably because an earlier run already saved them).

for i, sstring in enumerate(SearchString):
    if i < 10:
        continue
    print("Nifty50 Extraction Search Count :", i + 1)
    initialize(sstring)
    getnewslinks()
    getarticles()
    chkdata()
    savefile(sstring)
    time.sleep(5)

Nifty50 Extraction Search Count : 11


  0%|                                                                                           | 0/35 [00:00<?, ?it/s]

Total number of result pages for axis%20bank : 35
[INFO] Extracting Links...


100%|██████████████████████████████████████████████████████████████████████████████████| 35/35 [00:53<00:00,  1.53s/it]
  0%|                                                                                          | 0/410 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  410
Oldest Available Article:  2019-10-22
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 410/410 [03:21<00:00,  2.03it/s]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines            0
Sections             0
Articles           124
Published_Dates      0
Source_URLs          0
ByLines              0
dtype: int64
Total Usable Scraped Data :  (286, 6)
Data saved for axis%20bank .
Nifty50 Extraction Search Count : 12


  0%|                                                                                           | 0/38 [00:00<?, ?it/s]

Total number of result pages for airtel : 38
[INFO] Extracting Links...


100%|██████████████████████████████████████████████████████████████████████████████████| 38/38 [00:59<00:00,  1.58s/it]
  0%|                                                                                          | 0/449 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  449
Oldest Available Article:  2019-10-21
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 449/449 [03:52<00:00,  1.93it/s]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Sections            0
Articles           83
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (366, 6)
Data saved for airtel .
Nifty50 Extraction Search Count : 13


  0%|                                                                                           | 0/18 [00:00<?, ?it/s]

Total number of result pages for asian%20paints : 18
[INFO] Extracting Links...


100%|██████████████████████████████████████████████████████████████████████████████████| 18/18 [00:26<00:00,  1.46s/it]
  0%|                                                                                          | 0/205 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  205
Oldest Available Article:  2019-10-22
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 205/205 [01:02<00:00,  3.30it/s]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Sections            0
Articles           71
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (134, 6)
Data saved for asian%20paints .
Nifty50 Extraction Search Count : 14


  0%|                                                                                           | 0/29 [00:00<?, ?it/s]

Total number of result pages for maruti : 29
[INFO] Extracting Links...


100%|██████████████████████████████████████████████████████████████████████████████████| 29/29 [00:46<00:00,  1.60s/it]
  0%|▏                                                                                 | 1/344 [00:00<01:09,  4.97it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  344
Oldest Available Article:  2019-10-23
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 344/344 [02:35<00:00,  2.21it/s]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Sections            0
Articles           82
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (262, 6)
Data saved for maruti .
Nifty50 Extraction Search Count : 15


  5%|███▊                                                                               | 1/22 [00:00<00:04,  4.75it/s]

Total number of result pages for hcl : 22
[INFO] Extracting Links...


100%|██████████████████████████████████████████████████████████████████████████████████| 22/22 [00:29<00:00,  1.33s/it]
  0%|▎                                                                                 | 1/256 [00:00<00:51,  4.98it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  256
Oldest Available Article:  2019-10-22
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 256/256 [01:12<00:00,  3.52it/s]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Sections            0
Articles           72
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (184, 6)
Data saved for hcl .
Nifty50 Extraction Search Count : 16


  0%|                                                                                           | 0/33 [00:00<?, ?it/s]

Total number of result pages for bajaj%20finance : 33
[INFO] Extracting Links...


100%|██████████████████████████████████████████████████████████████████████████████████| 33/33 [00:45<00:00,  1.37s/it]
  0%|                                                                                          | 0/394 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  394
Oldest Available Article:  2019-10-22
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 394/394 [02:49<00:00,  2.32it/s]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines            0
Sections             0
Articles           105
Published_Dates      0
Source_URLs          0
ByLines              0
dtype: int64
Total Usable Scraped Data :  (289, 6)
Data saved for bajaj%20finance .
Nifty50 Extraction Search Count : 17


  0%|                                                                                           | 0/46 [00:00<?, ?it/s]

Total number of result pages for sbi : 46
[INFO] Extracting Links...


100%|██████████████████████████████████████████████████████████████████████████████████| 46/46 [01:04<00:00,  1.40s/it]
  0%|▏                                                                                 | 1/543 [00:00<01:51,  4.87it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  543
Oldest Available Article:  2019-10-23
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████| 543/543 [1:14:37<00:00,  8.25s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines            0
Sections             0
Articles           108
Published_Dates      0
Source_URLs          0
ByLines              0
dtype: int64
Total Usable Scraped Data :  (435, 6)
Data saved for sbi .
Nifty50 Extraction Search Count : 18


 12%|██████████▌                                                                         | 1/8 [00:00<00:01,  4.81it/s]

Total number of result pages for reddy : 8
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:11<00:00,  1.44s/it]
  0%|                                                                                           | 0/95 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  95
Oldest Available Article:  2019-10-23
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 95/95 [01:09<00:00,  1.37it/s]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Sections            0
Articles           16
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (79, 6)
Data saved for reddy .
Nifty50 Extraction Search Count : 19


  0%|                                                                                           | 0/35 [00:00<?, ?it/s]

Total number of result pages for mahindra : 35
[INFO] Extracting Links...


100%|██████████████████████████████████████████████████████████████████████████████████| 35/35 [00:54<00:00,  1.57s/it]
  0%|                                                                                          | 0/414 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  414
Oldest Available Article:  2019-10-22
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 414/414 [03:24<00:00,  2.03it/s]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Sections            0
Articles           94
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (320, 6)
Data saved for mahindra .
Nifty50 Extraction Search Count : 20


  0%|                                                                                           | 0/13 [00:00<?, ?it/s]

Total number of result pages for nestle : 13
[INFO] Extracting Links...


100%|██████████████████████████████████████████████████████████████████████████████████| 13/13 [00:21<00:00,  1.64s/it]
  0%|                                                                                          | 0/152 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  152
Oldest Available Article:  2019-11-22
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 152/152 [00:54<00:00,  2.78it/s]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Sections            0
Articles           56
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (96, 6)
Data saved for nestle .
Nifty50 Extraction Search Count : 21


  0%|                                                                                           | 0/21 [00:00<?, ?it/s]

Total number of result pages for sun%20pharma : 21
[INFO] Extracting Links...


100%|██████████████████████████████████████████████████████████████████████████████████| 21/21 [00:33<00:00,  1.58s/it]
  0%|▎                                                                                 | 1/246 [00:00<00:49,  4.99it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  246
Oldest Available Article:  2019-10-22
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 246/246 [01:23<00:00,  2.96it/s]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Sections            0
Articles           69
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (177, 6)
Data saved for sun%20pharma .
Nifty50 Extraction Search Count : 22


  0%|                                                                                           | 0/15 [00:00<?, ?it/s]

Total number of result pages for titan : 15
[INFO] Extracting Links...


100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [00:21<00:00,  1.44s/it]
  0%|                                                                                          | 0/179 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  179
Oldest Available Article:  2019-11-22
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 179/179 [01:00<00:00,  2.98it/s]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Sections            0
Articles           54
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (125, 6)
Data saved for titan .
Nifty50 Extraction Search Count : 23


  0%|                                                                                           | 0/21 [00:00<?, ?it/s]

Total number of result pages for tech%20mahindra : 21
[INFO] Extracting Links...


100%|██████████████████████████████████████████████████████████████████████████████████| 21/21 [00:30<00:00,  1.46s/it]
  0%|                                                                                          | 0/251 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  251
Oldest Available Article:  2019-10-22
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 251/251 [00:59<00:00,  4.23it/s]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Sections            0
Articles           81
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (170, 6)
Data saved for tech%20mahindra .
Nifty50 Extraction Search Count : 24


  0%|                                                                                           | 0/12 [00:00<?, ?it/s]

Total number of result pages for ultratech : 12
[INFO] Extracting Links...


100%|██████████████████████████████████████████████████████████████████████████████████| 12/12 [00:16<00:00,  1.37s/it]
  1%|▌                                                                                 | 1/134 [00:00<00:26,  5.06it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  134
Oldest Available Article:  2019-11-22
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 134/134 [00:39<00:00,  3.41it/s]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Sections            0
Articles           46
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (88, 6)
Data saved for ultratech .
Nifty50 Extraction Search Count : 25


  0%|                                                                                            | 0/6 [00:00<?, ?it/s]

Total number of result pages for wipro : 6
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:09<00:00,  1.54s/it]
  0%|                                                                                           | 0/68 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  68
Oldest Available Article:  2019-10-24
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 68/68 [00:37<00:00,  1.81it/s]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Sections            0
Articles           13
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (55, 6)
Data saved for wipro .
Nifty50 Extraction Search Count : 26


ValueError: max() arg is an empty sequence

In [20]:
# Scraping news articles for all the companies listed in Nifty50.
# The previous run of this loop stopped with
# "ValueError: max() arg is an empty sequence" at search count 26,
# so this cell resumes from that position instead of starting over.

RESUME_FROM_INDEX = 25   # 0-based position in SearchString (search count 26); set to 0 for a full run
PAUSE_SECONDS = 5        # pause between two companies

# (search count, search string, error message) for every company that raised
failed_searches = []

for i, sstring in enumerate(SearchString):
    if i < RESUME_FROM_INDEX:
        continue  # skipped: presumably already scraped by the earlier run
    print("Nifty50 Extraction Search Count :",i+1)
    try:
        initialize(sstring)
        getnewslinks()
        getarticles()
        chkdata()
        savefile(sstring)
    except ValueError as err:
        # Record the failure and carry on, so a single company (e.g. one whose
        # search yields an empty sequence) does not abort the remaining ones.
        failed_searches.append((i + 1, sstring, str(err)))
        print("[WARN] Extraction failed for", sstring, ":", err)
    time.sleep(PAUSE_SECONDS)

# Summarise anything that needs a manual re-run
if failed_searches:
    print("Searches to re-run :", failed_searches)

Nifty50 Extraction Search Count : 26


  0%|                                                                                            | 0/1 [00:00<?, ?it/s]

Total number of result pages for britannia : 1
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.72it/s]
  0%|                                                                                            | 0/7 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  7
Oldest Available Article:  2019-12-17
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:06<00:00,  1.02it/s]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Sections           0
Articles           4
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (3, 6)
Data saved for britannia .
Nifty50 Extraction Search Count : 27


  0%|                                                                                            | 0/6 [00:00<?, ?it/s]

Total number of result pages for hdfc%20insurance : 6
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:09<00:00,  1.51s/it]
  0%|                                                                                           | 0/66 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  66
Oldest Available Article:  2019-10-30
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 66/66 [00:54<00:00,  1.22it/s]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Sections            0
Articles           38
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (28, 6)
Data saved for hdfc%20insurance .
Nifty50 Extraction Search Count : 28


  0%|                                                                                           | 0/17 [00:00<?, ?it/s]

Total number of result pages for powergrid : 17
[INFO] Extracting Links...


100%|██████████████████████████████████████████████████████████████████████████████████| 17/17 [00:23<00:00,  1.39s/it]
  0%|                                                                                          | 0/199 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  199
Oldest Available Article:  2019-10-24
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 199/199 [02:28<00:00,  1.34it/s]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Sections            0
Articles           60
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (139, 6)
Data saved for powergrid .
Nifty50 Extraction Search Count : 29


  0%|                                                                                           | 0/19 [00:00<?, ?it/s]

Total number of result pages for ntpc : 19
[INFO] Extracting Links...


100%|██████████████████████████████████████████████████████████████████████████████████| 19/19 [00:26<00:00,  1.41s/it]
  0%|▎                                                                                 | 1/226 [00:00<00:46,  4.79it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  226
Oldest Available Article:  2019-10-25
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 226/226 [01:58<00:00,  1.90it/s]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Sections            0
Articles           71
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (155, 6)
Data saved for ntpc .
Nifty50 Extraction Search Count : 30


  0%|                                                                                           | 0/14 [00:00<?, ?it/s]

Total number of result pages for hero%20motocorp : 14
[INFO] Extracting Links...


100%|██████████████████████████████████████████████████████████████████████████████████| 14/14 [00:19<00:00,  1.38s/it]
  0%|                                                                                          | 0/162 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  162
Oldest Available Article:  2019-10-22
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 162/162 [01:26<00:00,  1.87it/s]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Sections            0
Articles           34
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (128, 6)
Data saved for hero%20motocorp .
Nifty50 Extraction Search Count : 31


  0%|                                                                                            | 0/2 [00:00<?, ?it/s]

Total number of result pages for cipla : 2
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:01<00:00,  1.05it/s]
  0%|                                                                                           | 0/18 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  18
Oldest Available Article:  2020-02-09
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 18/18 [00:11<00:00,  1.59it/s]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Sections           0
Articles           6
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (12, 6)
Data saved for cipla .
Nifty50 Extraction Search Count : 32


  0%|                                                                                            | 0/1 [00:00<?, ?it/s]

Total number of result pages for divis : 1
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.91it/s]
  0%|                                                                                            | 0/2 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  2
Oldest Available Article:  2019-11-05
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:01<00:00,  1.32it/s]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Sections           0
Articles           0
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (2, 6)
Data saved for divis .
Nifty50 Extraction Search Count : 33


  0%|                                                                                           | 0/22 [00:00<?, ?it/s]

Total number of result pages for bajaj%20auto : 22
[INFO] Extracting Links...


100%|██████████████████████████████████████████████████████████████████████████████████| 22/22 [00:35<00:00,  1.59s/it]
  0%|                                                                                          | 0/258 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  258
Oldest Available Article:  2019-10-22
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 258/258 [01:42<00:00,  2.52it/s]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines            0
Sections             0
Articles           105
Published_Dates      0
Source_URLs          0
ByLines              0
dtype: int64
Total Usable Scraped Data :  (153, 6)
Data saved for bajaj%20auto .
Nifty50 Extraction Search Count : 34


  0%|                                                                                            | 0/6 [00:00<?, ?it/s]

Total number of result pages for bajaj%20finserv : 6
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:07<00:00,  1.29s/it]
  0%|                                                                                           | 0/66 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  66
Oldest Available Article:  2019-12-14
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 66/66 [00:24<00:00,  2.68it/s]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Sections            0
Articles           24
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (42, 6)
Data saved for bajaj%20finserv .
Nifty50 Extraction Search Count : 35


  0%|                                                                                            | 0/6 [00:00<?, ?it/s]

Total number of result pages for sbi%20insurance : 6
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:08<00:00,  1.50s/it]
  0%|                                                                                           | 0/67 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  67
Oldest Available Article:  2019-10-23
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 67/67 [00:30<00:00,  2.17it/s]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Sections            0
Articles           27
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (40, 6)
Data saved for sbi%20insurance .
Nifty50 Extraction Search Count : 36


  0%|                                                                                            | 0/2 [00:00<?, ?it/s]

Total number of result pages for eicher : 2
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:01<00:00,  1.54it/s]
  5%|███▉                                                                               | 1/21 [00:00<00:03,  5.03it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  21
Oldest Available Article:  2020-01-01
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 21/21 [00:12<00:00,  1.69it/s]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Sections           0
Articles           4
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (17, 6)
Data saved for eicher .
Nifty50 Extraction Search Count : 37


  0%|                                                                                           | 0/31 [00:00<?, ?it/s]

Total number of result pages for indusind : 31
[INFO] Extracting Links...


100%|██████████████████████████████████████████████████████████████████████████████████| 31/31 [00:44<00:00,  1.45s/it]
  0%|                                                                                          | 0/370 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  370
Oldest Available Article:  2019-10-24
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 370/370 [02:26<00:00,  2.53it/s]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines            0
Sections             0
Articles           100
Published_Dates      0
Source_URLs          0
ByLines              0
dtype: int64
Total Usable Scraped Data :  (270, 6)
Data saved for indusind .
Nifty50 Extraction Search Count : 38


100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.97it/s]

Total number of result pages for grasim : 1
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.94it/s]
 20%|████████████████▊                                                                   | 1/5 [00:00<00:00,  5.07it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  5
Oldest Available Article:  2019-10-22
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:02<00:00,  2.07it/s]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Sections           0
Articles           0
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (5, 6)
Data saved for grasim .
Nifty50 Extraction Search Count : 39


  0%|                                                                                            | 0/8 [00:00<?, ?it/s]

Total number of result pages for bpcl : 8
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:12<00:00,  1.56s/it]
  0%|                                                                                           | 0/88 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  88
Oldest Available Article:  2019-10-23
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 88/88 [01:00<00:00,  1.46it/s]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Sections            0
Articles           14
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (74, 6)
Data saved for bpcl .
Nifty50 Extraction Search Count : 40


  0%|                                                                                            | 0/4 [00:00<?, ?it/s]

Total number of result pages for jsw : 4
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:05<00:00,  1.30s/it]
  0%|                                                                                           | 0/39 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  39
Oldest Available Article:  2019-10-23
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 39/39 [00:25<00:00,  1.51it/s]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Sections           0
Articles           1
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (38, 6)
Data saved for jsw .
Nifty50 Extraction Search Count : 41


100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.98it/s]

Total number of result pages for upl : 1
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.95it/s]
  0%|                                                                                            | 0/3 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  3
Oldest Available Article:  2020-05-20
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:01<00:00,  2.82it/s]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Sections           0
Articles           0
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (3, 6)
Data saved for upl .
Nifty50 Extraction Search Count : 42


  0%|                                                                                            | 0/1 [00:00<?, ?it/s]

Total number of result pages for shree%20cement : 1
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.85it/s]
  0%|                                                                                            | 0/5 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  5
Oldest Available Article:  2019-11-29
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:01<00:00,  2.77it/s]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Sections           0
Articles           3
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (2, 6)
Data saved for shree%20cement .
Nifty50 Extraction Search Count : 43


  0%|                                                                                           | 0/27 [00:00<?, ?it/s]

Total number of result pages for tata%20steel : 27
[INFO] Extracting Links...


100%|██████████████████████████████████████████████████████████████████████████████████| 27/27 [00:39<00:00,  1.45s/it]
  0%|                                                                                          | 0/322 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  322
Oldest Available Article:  2019-10-24
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 322/322 [02:01<00:00,  2.65it/s]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Sections            0
Articles           88
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (234, 6)
Data saved for tata%20steel .
Nifty50 Extraction Search Count : 44


100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.98it/s]

Total number of result pages for hindalco : 1
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.95it/s]
  0%|                                                                                            | 0/9 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  9
Oldest Available Article:  2019-11-11
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:05<00:00,  1.59it/s]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Sections           0
Articles           0
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (9, 6)
Data saved for hindalco .
Nifty50 Extraction Search Count : 45


  0%|                                                                                            | 0/7 [00:00<?, ?it/s]

Total number of result pages for adani : 7
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:09<00:00,  1.38s/it]
  0%|                                                                                           | 0/84 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  84
Oldest Available Article:  2019-10-29
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 84/84 [00:51<00:00,  1.63it/s]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Sections            0
Articles           13
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (71, 6)
Data saved for adani .
Nifty50 Extraction Search Count : 46


  0%|                                                                                           | 0/23 [00:00<?, ?it/s]

Total number of result pages for ongc : 23
[INFO] Extracting Links...


100%|██████████████████████████████████████████████████████████████████████████████████| 23/23 [00:29<00:00,  1.30s/it]
  0%|▎                                                                                 | 1/272 [00:00<00:54,  4.98it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  272
Oldest Available Article:  2019-10-23
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 272/272 [01:43<00:00,  2.62it/s]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Sections            0
Articles           73
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (199, 6)
Data saved for ongc .
Nifty50 Extraction Search Count : 47


  0%|                                                                                            | 0/9 [00:00<?, ?it/s]

Total number of result pages for coal%20india : 9
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:12<00:00,  1.35s/it]
  0%|                                                                                          | 0/105 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  105
Oldest Available Article:  2019-10-31
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 105/105 [01:02<00:00,  1.68it/s]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Sections            0
Articles           24
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (81, 6)
Data saved for coal%20india .
Nifty50 Extraction Search Count : 48


  0%|                                                                                           | 0/17 [00:00<?, ?it/s]

Total number of result pages for tata%20motors : 17
[INFO] Extracting Links...


100%|██████████████████████████████████████████████████████████████████████████████████| 17/17 [00:22<00:00,  1.34s/it]
  1%|▍                                                                                 | 1/196 [00:00<00:41,  4.75it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  196
Oldest Available Article:  2019-10-22
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 196/196 [01:38<00:00,  1.98it/s]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Sections            0
Articles           39
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (157, 6)
Data saved for tata%20motors .
Nifty50 Extraction Search Count : 49


  0%|                                                                                            | 0/6 [00:00<?, ?it/s]

Total number of result pages for ioc : 6
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:08<00:00,  1.45s/it]
  0%|                                                                                           | 0/64 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  64
Oldest Available Article:  2019-10-23
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 64/64 [00:25<00:00,  2.54it/s]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Sections            0
Articles           10
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (54, 6)
Data saved for ioc .
Nifty50 Extraction Search Count : 50


  0%|                                                                                            | 0/3 [00:00<?, ?it/s]

Total number of result pages for gail : 3
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:03<00:00,  1.03s/it]
  0%|                                                                                           | 0/27 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  27
Oldest Available Article:  2020-01-08
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 27/27 [00:12<00:00,  2.22it/s]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Sections           0
Articles           5
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (22, 6)
Data saved for gail .


In [None]:
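# TODO: Pages whose Articles field came back blank (the non-zero 'Articles' counts under
# 'Missing Info in Scraped Data' above) have a different page structure.
# These pages carry the byline 'The Hindu Net Desk' and can be scraped and preprocessed separately.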