In [1]:
from bs4 import BeautifulSoup
from requests import get
import pandas as pd
from tqdm import tqdm
from dateutil import parser
import string
import re
import time

In [2]:
# Scrape the current Nifty50 index composition page on Moneycontrol and tabulate,
# per constituent: company name, industry, last price, change, change % and market cap.

def _stripped_text(tags):
    '''Return the whitespace-stripped text of every tag in `tags`.'''
    return [tag.text.strip() for tag in tags]


getnifty50 = "https://www.moneycontrol.com/stocks/marketstats/indexcomp.php?optex=NSE&opttopic=indexcomp&index=9"
soup = BeautifulSoup(get(getnifty50).text, 'lxml')
composition_n50 = soup.select('table.tbldata14.bdrtpg')[0]

# Timestamp text taken from the page itself and parsed into a datetime.
Scrape_date = parser.parse(soup.select('div.FR.b_15.PT5')[0].text)

# Every second 'a.bl_12' anchor from index 2 is read as a company name,
# every second one from index 3 as its industry.
_n50_anchors = composition_n50.select("a.bl_12")
Company_Name = _stripped_text(_n50_anchors[2::2])
Industry = _stripped_text(_n50_anchors[3::2])

# 'td.brdrgtgry' cells are read with a stride of six; offsets 2..5 give
# last price, change, change % and market cap respectively.
_n50_cells = composition_n50.select('td.brdrgtgry')
Last_Price = _stripped_text(_n50_cells[2::6])
Change = _stripped_text(_n50_cells[3::6])
Change_percent = _stripped_text(_n50_cells[4::6])
Mrk_Cap = _stripped_text(_n50_cells[5::6])

nifty50_latest = pd.DataFrame({
    'Company_Name': Company_Name,
    'Industry': Industry,
    'Last_Price': Last_Price,
    'Change': Change,
    'Change_percent': Change_percent,
    'Mrk_Cap(Rs Cr)': Mrk_Cap,
}).assign(Scrape_date=Scrape_date)

print(nifty50_latest.shape)
nifty50_latest.to_csv("nifty50_latest.csv")
nifty50_latest.head()

(50, 7)


Unnamed: 0,Company_Name,Industry,Last_Price,Change,Change_percent,Mrk_Cap(Rs Cr),Scrape_date
0,Adani Ports,Transport Infrastructure,368.65,7.55,2.09,74900.53,2020-11-05 15:59:00
1,Asian Paints,Paints,2238.3,68.85,3.17,214697.24,2020-11-05 15:59:00
2,Axis Bank,Bank - Private,539.3,14.15,2.69,165036.05,2020-11-05 15:59:00
3,Bajaj Auto,Automobile - 2 & 3 Wheelers,2949.3,23.2,0.79,85343.02,2020-11-05 15:59:00
4,Bajaj Finance,Finance - NBFC,3736.9,175.45,4.93,225180.86,2020-11-05 15:59:00


In [3]:
# Load the hand-maintained lookup table that maps each Nifty50 stock to the
# identifier used in Moneycontrol URLs (column 'mcontrol_substring').
nifty50_lookuptable = pd.read_csv("nifty50_lookuptable.csv")

# Plain Python list of the Moneycontrol identifiers, in table order.
Substring = list(nifty50_lookuptable['mcontrol_substring'])

print(nifty50_lookuptable.shape)
nifty50_lookuptable.head()

(50, 6)


Unnamed: 0,Sr.No.,Company Name,Sector,Weightage,thehindu_searchstring,mcontrol_substring
0,1,Reliance Industries Ltd.,Petroleum Products,14.93%,reliance%20petroleum,RI
1,2,HDFC Bank Ltd.,Banks,9.69%,hdfc%20bank,HDF01
2,3,Infosys Limited,Software,7.63%,infosys,IT
3,4,Housing Development Fin. Corp. Ltd.,Finance,6.44%,hdfc,HDF
4,5,Tata Consultancy Services Ltd.,Software,5.41%,tcs,TCS


In [4]:
def initialize(yr, urlsplit):
    '''
    Function to obtain total number of result pages, initialize blank news data and
    set urls for moneycontrol news search page for the input year 'yr'.

    Parameters:
        yr       : year to search; substituted into the 'Year' query parameter.
        urlsplit : Moneycontrol company identifier; substituted into the 'sc_id' query parameter.

    Side effects (module-level globals, assigned only after the page was parsed successfully):
        ticker   : ticker text parsed from the first result page.
        url_all  : one search-result URL per result page.
        headlines, dates, news, urls, sources : reset to empty lists.

    Raises:
        IndexError if the ticker block ('div.FL.gry10') is missing or does not have the
        expected '... | label: ticker | ...' shape. Errors raised by the HTTP request
        (including a timeout) propagate to the caller.
    '''
    global ticker, url_all, headlines, dates, news, urls, sources

    urlyr = "https://www.moneycontrol.com/stocks/company_info/stock_news.php?sc_id=" + urlsplit + "&durationType=Y&Year={}"
    url = "https://www.moneycontrol.com/stocks/company_info/stock_news.php?sc_id={}&scat=&pageno={}&next=0&durationType=Y&Year={}&duration=1&news_type="

    # A timeout keeps one stalled connection from hanging the whole scraping run.
    soup = BeautifulSoup(get(urlyr.format(yr), timeout=30).text, 'lxml')
    found_ticker = soup.select('div.FL.gry10')[0].text.split('|')[1].split(':')[1].strip()

    # The pager lists links to the *other* pages, hence the +1. When the pager
    # block is absent altogether there is just the page that was fetched.
    pagination = soup.select('div.pages.MR10.MT15')
    result_max = len(pagination[0].select('a')) + 1 if pagination else 1

    # Publish the new state in one go so a parsing failure above cannot leave
    # a new ticker paired with the previous company's URLs and result lists.
    ticker = found_ticker
    url_all = [url.format(urlsplit, i, yr) for i in range(1, result_max+1)]
    headlines, dates, news, urls, sources = [], [], [], [], []
    print("Total number of result pages for", ticker, "in the year", yr, ":", result_max)

In [5]:
def getnewslinks():
    '''
    Function to scrape news headlines, urls, publish dates etc.

    Walks every result page in the module-level list 'url_all' and appends to the
    module-level lists 'headlines', 'urls', 'dates' and 'sources'. The four lists
    always grow by the same number of entries, one per headline anchor; a field
    that could not be extracted is stored as None so the row can be dropped later.

    If a result page cannot be fetched, paging stops and the entries collected so
    far are kept.
    '''
    print("[INFO] Extracting Links...")

    for src in tqdm(url_all):
        try:
            # A timeout keeps one stalled connection from hanging the whole run.
            soup = BeautifulSoup(get(src, timeout=30).text, 'lxml')
        except Exception:
            print("Exception occurred in url : ", src)
            break

        for headline, link, published, byline in _parse_result_page(soup):
            headlines.append(headline)
            urls.append(link)
            dates.append(published)
            sources.append(byline)

    print("[INFO] Links Extracted.")
    print("Total No. of Pages to be Scraped = ", len(urls))
    # Dates are 'YYYY-MM-DD' strings, so the string minimum is the oldest date.
    known_dates = [d for d in dates if d is not None]
    print("Oldest Available Article: ", min(known_dates) if known_dates else None)


def _parse_result_page(soup):
    '''
    Return one (headline, url, date, byline) tuple per 'a.g_14bl' anchor on a result page.

    Each field is extracted independently; a field that fails is reported and set
    to None instead of aborting the page.
    '''
    anchors = soup.select('a.g_14bl')
    details = soup.select('p.PT3.a_10dgry')

    if len(details) != len(anchors):
        # Anchors and detail paragraphs are paired by position. With unequal
        # counts the pairing cannot be trusted, so dates/bylines are left blank
        # for this page rather than risking a date attached to the wrong headline.
        print('Exception in dateline')
        details = [None] * len(anchors)

    rows = []
    for anchor, detail in zip(anchors, details):
        try:
            headline = anchor.text.strip()
        except Exception:
            print('Exception in Headline')
            headline = None

        try:
            link = "https://www.moneycontrol.com" + anchor.get('href')
        except Exception:
            # e.g. the anchor has no href attribute
            print('Exception in url')
            link = None

        published, byline = None, None
        if detail is not None:
            try:
                # Second '|'-separated field is parsed as a date; keep only the date part.
                published = str(parser.parse(detail.text.split('|')[1].strip())).split()[0]
            except Exception:
                print('Exception in dateline')

            try:
                spans = detail.select('span.a_2_10bl')
                byline = spans[0].text.strip() if len(spans) == 1 else None
            except Exception:
                print('Exception in bylines')

        rows.append((headline, link, published, byline))

    return rows

In [6]:
def getarticles(thres=7):
    '''
    Function to scrape news articles. Any paragraph with words less than 'thres' will not be considered.

    For every URL in the module-level list 'urls', exactly one entry is appended to
    the module-level list 'news', so both lists stay aligned by position. The entry
    is the kept paragraphs joined with '.', or None when the page could not be
    fetched, has no 'div.arti-flow' container, or has no paragraph of at least
    'thres' words.

    Only ordinary errors are absorbed; KeyboardInterrupt propagates so the loop can
    be interrupted.
    '''
    print("[INFO] Extracting Articles...")

    for src in tqdm(urls):
        try:
            # A timeout keeps one stalled connection from hanging the whole run.
            soup = BeautifulSoup(get(src, timeout=30).text, 'lxml')
        except Exception:
            print("Exception occurred in url : ", src)
            news.append(None)
            continue

        news.append(_article_text(soup, thres))

    print("[INFO] Articles Extracted.")


def _article_text(soup, thres):
    '''Return the article text from a parsed news page, or None if nothing usable is found.'''
    try:
        body = soup.select("div.arti-flow")[0]
        paragraphs = [scrape.text.strip() for scrape in body.select("p")
                      if len(scrape.text.split()) >= thres]
    except Exception:
        return None
    # An empty join means no paragraph passed the threshold; report it as missing
    # so it is counted and dropped like any other missing article.
    return '.'.join(paragraphs) or None

In [7]:
def chkdata():
    '''
    Assemble the scraped lists into the global DataFrame 'df', report how many
    values are missing per column, and keep only the rows with no missing value.

    Reads the module-level lists headlines, news, dates, urls and sources.
    '''
    global df

    columns = {
        'Headlines': headlines,
        'Articles': news,
        'Published_Dates': dates,
        'Source_URLs': urls,
        'ByLines': sources,
    }
    scraped = pd.DataFrame(columns)

    print("Missing Info in Scraped Data :")
    print(scraped.isna().sum())

    # Rows with any missing field are discarded.
    df = scraped.dropna(axis=0)
    print("Total Usable Scraped Data : ", df.shape)

In [8]:
def savefile(tickr,yr):
    '''
    Pickle the global DataFrame 'df' to 'news_mcontrol_<tickr>_<yr>.pkl' in the
    current working directory and print a confirmation line.
    '''
    filename = "".join(["news_mcontrol_", tickr, "_", str(yr), ".pkl"])
    df.to_pickle(filename)
    print("Data saved for", tickr, "for year",yr, ".")

In [None]:
# Scrape the 2019 news archive for every company in the lookup table, one company
# at a time: find the result pages, collect links, fetch articles, clean, save.

yr = 2019

for i, sstring in enumerate(Substring):
    search_count = i + 1  # 1-based position shown in the progress log
    print("Nifty50 Extraction Search Count :", search_count)

    initialize(yr, sstring)
    getnewslinks()
    getarticles()
    chkdata()
    savefile(ticker, yr)

    # Pause between companies.
    time.sleep(5)

Nifty50 Extraction Search Count : 1


  0%|                                                                                            | 0/7 [00:00<?, ?it/s]

Total number of result pages for RELIANCE in the year 2019 : 7
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:04<00:00,  1.54it/s]
  0%|                                                                                          | 0/138 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  138
Oldest Available Article:  2019-01-06
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 138/138 [05:16<00:00,  2.29s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           7
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (131, 5)
Data saved for RELIANCE for year 2019 .
Nifty50 Extraction Search Count : 2


  0%|                                                                                            | 0/5 [00:00<?, ?it/s]

Total number of result pages for HDFCBANK in the year 2019 : 5
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:03<00:00,  1.53it/s]
  0%|                                                                                           | 0/92 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  92
Oldest Available Article:  2019-01-14
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 92/92 [03:34<00:00,  2.33s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           8
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (84, 5)
Data saved for HDFCBANK for year 2019 .
Nifty50 Extraction Search Count : 3


  0%|                                                                                            | 0/7 [00:00<?, ?it/s]

Total number of result pages for INFY in the year 2019 : 7
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:04<00:00,  1.51it/s]
  0%|                                                                                          | 0/132 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  132
Oldest Available Article:  2019-01-04
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 132/132 [05:23<00:00,  2.45s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           7
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (125, 5)
Data saved for INFY for year 2019 .
Nifty50 Extraction Search Count : 4


  0%|                                                                                            | 0/3 [00:00<?, ?it/s]

Total number of result pages for HDFC in the year 2019 : 3
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:02<00:00,  1.45it/s]
  0%|                                                                                           | 0/57 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  57
Oldest Available Article:  2019-01-14
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 57/57 [02:12<00:00,  2.32s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           3
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (54, 5)
Data saved for HDFC for year 2019 .
Nifty50 Extraction Search Count : 5


  0%|                                                                                            | 0/6 [00:00<?, ?it/s]

Total number of result pages for TCS in the year 2019 : 6
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00,  1.46it/s]
  0%|                                                                                          | 0/110 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  110
Oldest Available Article:  2019-01-08
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 110/110 [04:18<00:00,  2.35s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           6
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (104, 5)
Data saved for TCS for year 2019 .
Nifty50 Extraction Search Count : 6


  0%|                                                                                            | 0/6 [00:00<?, ?it/s]

Total number of result pages for ICICIBANK in the year 2019 : 6
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00,  1.43it/s]
  0%|                                                                                          | 0/110 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  110
Oldest Available Article:  2019-01-02
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 110/110 [04:14<00:00,  2.31s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           3
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (107, 5)
Data saved for ICICIBANK for year 2019 .
Nifty50 Extraction Search Count : 7


  0%|                                                                                            | 0/4 [00:00<?, ?it/s]

Total number of result pages for KOTAKBANK in the year 2019 : 4
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:02<00:00,  1.45it/s]
  0%|                                                                                           | 0/62 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  62
Oldest Available Article:  2019-01-01
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 62/62 [02:21<00:00,  2.28s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           7
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (55, 5)
Data saved for KOTAKBANK for year 2019 .
Nifty50 Extraction Search Count : 8


  0%|                                                                                            | 0/5 [00:00<?, ?it/s]

Total number of result pages for HINDUNILVR in the year 2019 : 5
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:09<00:00,  1.92s/it]
  0%|                                                                                           | 0/88 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  88
Oldest Available Article:  2019-01-10
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 88/88 [03:09<00:00,  2.15s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Articles           11
Published_Dates     0
Source_URLs         0
ByLines             1
dtype: int64
Total Usable Scraped Data :  (77, 5)
Data saved for HINDUNILVR for year 2019 .
Nifty50 Extraction Search Count : 9


  0%|                                                                                            | 0/4 [00:00<?, ?it/s]

Total number of result pages for ITC in the year 2019 : 4
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:02<00:00,  1.35it/s]
  0%|                                                                                           | 0/77 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  77
Oldest Available Article:  2019-01-10
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 77/77 [02:28<00:00,  1.93s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           2
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (75, 5)
Data saved for ITC for year 2019 .
Nifty50 Extraction Search Count : 10


  0%|                                                                                            | 0/7 [00:00<?, ?it/s]

Total number of result pages for LT in the year 2019 : 7
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:28<00:00,  4.10s/it]
  0%|                                                                                          | 0/139 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  139
Oldest Available Article:  2019-01-03
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 139/139 [05:49<00:00,  2.51s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           8
Published_Dates    0
Source_URLs        0
ByLines            1
dtype: int64
Total Usable Scraped Data :  (130, 5)
Data saved for LT for year 2019 .
Nifty50 Extraction Search Count : 11


  0%|                                                                                            | 0/5 [00:00<?, ?it/s]

Total number of result pages for AXISBANK in the year 2019 : 5
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:26<00:00,  5.23s/it]
  0%|                                                                                           | 0/92 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  92
Oldest Available Article:  2019-01-08
[INFO] Extracting Articles...


 23%|██████████████████▋                                                               | 21/92 [00:58<04:47,  4.04s/it]

Exception occurred in url :  https://www.moneycontrol.com/news/business/ready-âto-âsacrificeâ-growthâ-inâ-some-segmentsâ-to-pursue-sustainability-axis-bank_12845341.html


100%|██████████████████████████████████████████████████████████████████████████████████| 92/92 [03:55<00:00,  2.56s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           7
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (85, 5)
Data saved for AXISBANK for year 2019 .
Nifty50 Extraction Search Count : 12


  0%|                                                                                            | 0/7 [00:00<?, ?it/s]

Total number of result pages for BHARTIARTL in the year 2019 : 7
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:43<00:00,  6.20s/it]
  0%|                                                                                          | 0/127 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  127
Oldest Available Article:  2019-01-04
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 127/127 [04:58<00:00,  2.35s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           3
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (124, 5)
Data saved for BHARTIARTL for year 2019 .
Nifty50 Extraction Search Count : 13


  0%|                                                                                            | 0/3 [00:00<?, ?it/s]

Total number of result pages for ASIANPAINT in the year 2019 : 3
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:19<00:00,  6.55s/it]
  0%|                                                                                           | 0/50 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  50
Oldest Available Article:  2019-01-10
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [01:50<00:00,  2.22s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           6
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (44, 5)
Data saved for ASIANPAINT for year 2019 .
Nifty50 Extraction Search Count : 14


  0%|                                                                                           | 0/11 [00:00<?, ?it/s]

Total number of result pages for MARUTI in the year 2019 : 11
[INFO] Extracting Links...


100%|██████████████████████████████████████████████████████████████████████████████████| 11/11 [00:07<00:00,  1.47it/s]
  0%|                                                                                          | 0/201 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  201
Oldest Available Article:  2019-01-01
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 201/201 [07:50<00:00,  2.34s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           8
Published_Dates    0
Source_URLs        0
ByLines            1
dtype: int64
Total Usable Scraped Data :  (192, 5)
Data saved for MARUTI for year 2019 .
Nifty50 Extraction Search Count : 15


  0%|                                                                                            | 0/4 [00:00<?, ?it/s]

Total number of result pages for HCLTECH in the year 2019 : 4
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:02<00:00,  1.48it/s]
  0%|                                                                                           | 0/70 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  70
Oldest Available Article:  2019-01-12
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 70/70 [02:44<00:00,  2.35s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           5
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (65, 5)
Data saved for HCLTECH for year 2019 .
Nifty50 Extraction Search Count : 16


  0%|                                                                                            | 0/3 [00:00<?, ?it/s]

Total number of result pages for BAJFINANCE in the year 2019 : 3
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:01<00:00,  1.56it/s]
  0%|                                                                                           | 0/49 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  49
Oldest Available Article:  2019-01-04
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 49/49 [01:56<00:00,  2.38s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           6
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (43, 5)
Data saved for BAJFINANCE for year 2019 .
Nifty50 Extraction Search Count : 17


  0%|                                                                                           | 0/11 [00:00<?, ?it/s]

Total number of result pages for SBIN in the year 2019 : 11
[INFO] Extracting Links...


100%|██████████████████████████████████████████████████████████████████████████████████| 11/11 [00:07<00:00,  1.54it/s]
  0%|                                                                                          | 0/206 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  206
Oldest Available Article:  2019-01-07
[INFO] Extracting Articles...


  1%|█▏                                                                                | 3/206 [00:08<09:01,  2.67s/it]

In [None]:
# Resume the 2019 scrape at lookup-table index 16 (search count 17);
# entries before that index are skipped.

yr = 2019

for i, sstring in enumerate(Substring):
    if i < 16:
        continue

    print("Nifty50 Extraction Search Count :",i+1)
    initialize(yr, sstring)
    getnewslinks()
    getarticles()
    chkdata()
    savefile(ticker, yr)

    # Pause between companies.
    time.sleep(5)

Nifty50 Extraction Search Count : 17


  0%|                                                                                           | 0/11 [00:00<?, ?it/s]

Total number of result pages for SBIN in the year 2019 : 11
[INFO] Extracting Links...


100%|██████████████████████████████████████████████████████████████████████████████████| 11/11 [00:55<00:00,  5.01s/it]
  0%|                                                                                          | 0/206 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  206
Oldest Available Article:  2019-01-07
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 206/206 [08:26<00:00,  2.46s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           5
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (201, 5)
Data saved for SBIN for year 2019 .
Nifty50 Extraction Search Count : 18


  0%|                                                                                            | 0/4 [00:00<?, ?it/s]

Total number of result pages for DRREDDY in the year 2019 : 4
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:02<00:00,  1.49it/s]
  0%|                                                                                           | 0/67 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  67
Oldest Available Article:  2019-01-03
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 67/67 [02:38<00:00,  2.37s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           4
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (63, 5)
Data saved for DRREDDY for year 2019 .
Nifty50 Extraction Search Count : 19


  0%|                                                                                            | 0/7 [00:00<?, ?it/s]

Total number of result pages for M&M in the year 2019 : 7
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:04<00:00,  1.47it/s]
  0%|                                                                                          | 0/121 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  121
Oldest Available Article:  2019-01-01
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 121/121 [05:01<00:00,  2.49s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           6
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (115, 5)
Data saved for M&M for year 2019 .
Nifty50 Extraction Search Count : 20


  0%|                                                                                            | 0/2 [00:00<?, ?it/s]

Total number of result pages for NESTLEIND in the year 2019 : 2
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:14<00:00,  7.13s/it]
  0%|                                                                                           | 0/36 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  36
Oldest Available Article:  2019-01-07
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 36/36 [01:28<00:00,  2.46s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           2
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (34, 5)
Data saved for NESTLEIND for year 2019 .
Nifty50 Extraction Search Count : 21


  0%|                                                                                            | 0/4 [00:00<?, ?it/s]

Total number of result pages for SUNPHARMA in the year 2019 : 4
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:02<00:00,  1.55it/s]
  0%|                                                                                           | 0/75 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  75
Oldest Available Article:  2019-01-09
[INFO] Extracting Articles...


 93%|████████████████████████████████████████████████████████████████████████████▌     | 70/75 [02:48<00:12,  2.52s/it]

In [9]:
# Resume the 2019 scrape at lookup-table index 20 (search count 21);
# entries before that index are skipped.

yr = 2019

for i, sstring in enumerate(Substring):
    if i < 20:
        continue

    print("Nifty50 Extraction Search Count :",i+1)
    initialize(yr, sstring)
    getnewslinks()
    getarticles()
    chkdata()
    savefile(ticker, yr)

    # Pause between companies.
    time.sleep(5)

Nifty50 Extraction Search Count : 21


  0%|                                                                                            | 0/4 [00:00<?, ?it/s]

Total number of result pages for SUNPHARMA in the year 2019 : 4
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:07<00:00,  1.95s/it]
  0%|                                                                                           | 0/75 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  75
Oldest Available Article:  2019-01-09
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 75/75 [04:21<00:00,  3.49s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           1
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (74, 5)
Data saved for SUNPHARMA for year 2019 .
Nifty50 Extraction Search Count : 22


  0%|                                                                                            | 0/3 [00:00<?, ?it/s]

Total number of result pages for TITAN in the year 2019 : 3
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:08<00:00,  2.98s/it]
  0%|                                                                                           | 0/60 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  60
Oldest Available Article:  2019-01-07
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 60/60 [03:37<00:00,  3.62s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           6
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (54, 5)
Data saved for TITAN for year 2019 .
Nifty50 Extraction Search Count : 23


  0%|                                                                                            | 0/4 [00:00<?, ?it/s]

Total number of result pages for TECHM in the year 2019 : 4
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:02<00:00,  1.56it/s]
  0%|                                                                                           | 0/64 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  64
Oldest Available Article:  2019-01-01
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 64/64 [02:55<00:00,  2.74s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           3
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (61, 5)
Data saved for TECHM for year 2019 .
Nifty50 Extraction Search Count : 24


  0%|                                                                                            | 0/3 [00:00<?, ?it/s]

Total number of result pages for ULTRACEMCO in the year 2019 : 3
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:07<00:00,  2.37s/it]
  0%|                                                                                           | 0/54 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  54
Oldest Available Article:  2019-01-10
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 54/54 [02:23<00:00,  2.67s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           3
Published_Dates    0
Source_URLs        0
ByLines            1
dtype: int64
Total Usable Scraped Data :  (50, 5)
Data saved for ULTRACEMCO for year 2019 .
Nifty50 Extraction Search Count : 25


  0%|                                                                                            | 0/4 [00:00<?, ?it/s]

Total number of result pages for WIPRO in the year 2019 : 4
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:21<00:00,  5.41s/it]
  0%|                                                                                           | 0/77 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  77
Oldest Available Article:  2019-01-12
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 77/77 [03:18<00:00,  2.57s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           4
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (73, 5)
Data saved for WIPRO for year 2019 .
Nifty50 Extraction Search Count : 26


  0%|                                                                                            | 0/3 [00:00<?, ?it/s]

Total number of result pages for BRITANNIA in the year 2019 : 3
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:19<00:00,  6.60s/it]
  0%|                                                                                           | 0/44 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  44
Oldest Available Article:  2019-01-10
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 44/44 [02:10<00:00,  2.97s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           4
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (40, 5)
Data saved for BRITANNIA for year 2019 .
Nifty50 Extraction Search Count : 27


  0%|                                                                                            | 0/2 [00:00<?, ?it/s]

Total number of result pages for HDFCLIFE in the year 2019 : 2
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:19<00:00,  9.70s/it]
  0%|                                                                                           | 0/37 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  37
Oldest Available Article:  2019-01-14
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 37/37 [02:00<00:00,  3.26s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           0
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (37, 5)
Data saved for HDFCLIFE for year 2019 .
Nifty50 Extraction Search Count : 28


  0%|                                                                                            | 0/2 [00:00<?, ?it/s]

Total number of result pages for POWERGRID in the year 2019 : 2
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:13<00:00,  6.60s/it]
  0%|                                                                                           | 0/32 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  32
Oldest Available Article:  2019-01-14
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 32/32 [01:30<00:00,  2.81s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           0
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (32, 5)
Data saved for POWERGRID for year 2019 .
Nifty50 Extraction Search Count : 29


  0%|                                                                                            | 0/3 [00:00<?, ?it/s]

Total number of result pages for NTPC in the year 2019 : 3
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:22<00:00,  7.38s/it]
  0%|                                                                                           | 0/50 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  50
Oldest Available Article:  2019-01-11
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [02:03<00:00,  2.47s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           1
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (49, 5)
Data saved for NTPC for year 2019 .
Nifty50 Extraction Search Count : 30


  0%|                                                                                            | 0/5 [00:00<?, ?it/s]

Total number of result pages for HEROMOTOCO in the year 2019 : 5
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:30<00:00,  6.12s/it]
  0%|                                                                                           | 0/88 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  88
Oldest Available Article:  2019-01-02
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 88/88 [04:40<00:00,  3.19s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           3
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (85, 5)
Data saved for HEROMOTOCO for year 2019 .
Nifty50 Extraction Search Count : 31


  0%|                                                                                            | 0/3 [00:00<?, ?it/s]

Total number of result pages for CIPLA in the year 2019 : 3
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:20<00:00,  6.90s/it]
  0%|                                                                                           | 0/52 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  52
Oldest Available Article:  2019-01-01
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 52/52 [03:11<00:00,  3.68s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           0
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (52, 5)
Data saved for CIPLA for year 2019 .
Nifty50 Extraction Search Count : 32


  0%|                                                                                            | 0/1 [00:00<?, ?it/s]

Total number of result pages for DIVISLAB in the year 2019 : 1
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:11<00:00, 11.02s/it]
  0%|                                                                                           | 0/19 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  19
Oldest Available Article:  2019-01-11
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 19/19 [00:52<00:00,  2.76s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           1
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (18, 5)
Data saved for DIVISLAB for year 2019 .
Nifty50 Extraction Search Count : 33


  0%|                                                                                            | 0/6 [00:00<?, ?it/s]

Total number of result pages for BAJAJ-AUTO in the year 2019 : 6
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:36<00:00,  6.12s/it]
  0%|                                                                                          | 0/118 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  118
Oldest Available Article:  2019-01-04
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 118/118 [04:55<00:00,  2.51s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           6
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (112, 5)
Data saved for BAJAJ-AUTO for year 2019 .
Nifty50 Extraction Search Count : 34


  0%|                                                                                            | 0/2 [00:00<?, ?it/s]

Total number of result pages for BAJAJFINSV in the year 2019 : 2
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:13<00:00,  6.65s/it]
  0%|                                                                                           | 0/24 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  24
Oldest Available Article:  2019-01-29
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 24/24 [01:20<00:00,  3.34s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           1
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (23, 5)
Data saved for BAJAJFINSV for year 2019 .
Nifty50 Extraction Search Count : 35


  0%|                                                                                            | 0/1 [00:00<?, ?it/s]

Total number of result pages for SBILIFE in the year 2019 : 1
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:09<00:00,  9.46s/it]
  0%|                                                                                           | 0/19 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  19
Oldest Available Article:  2019-01-18
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 19/19 [00:53<00:00,  2.83s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           0
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (19, 5)
Data saved for SBILIFE for year 2019 .
Nifty50 Extraction Search Count : 36


  0%|                                                                                            | 0/4 [00:00<?, ?it/s]

Total number of result pages for EICHERMOT in the year 2019 : 4
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:27<00:00,  6.78s/it]
  0%|                                                                                           | 0/63 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  63
Oldest Available Article:  2019-01-01
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [02:56<00:00,  2.80s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           3
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (60, 5)
Data saved for EICHERMOT for year 2019 .
Nifty50 Extraction Search Count : 37


  0%|                                                                                            | 0/3 [00:00<?, ?it/s]

Total number of result pages for INDUSINDBK in the year 2019 : 3
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:02<00:00,  1.39it/s]
  0%|                                                                                           | 0/58 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  58
Oldest Available Article:  2019-01-08
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 58/58 [02:59<00:00,  3.10s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           4
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (54, 5)
Data saved for INDUSINDBK for year 2019 .
Nifty50 Extraction Search Count : 38


  0%|                                                                                            | 0/2 [00:00<?, ?it/s]

Total number of result pages for GRASIM in the year 2019 : 2
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:01<00:00,  1.60it/s]
  0%|                                                                                            | 0/1 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  1
Oldest Available Article:  2019-01-21
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.95s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           0
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (1, 5)
Data saved for GRASIM for year 2019 .
Nifty50 Extraction Search Count : 39


  0%|                                                                                            | 0/3 [00:00<?, ?it/s]

Total number of result pages for BPCL in the year 2019 : 3
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:04<00:00,  1.45s/it]
  0%|                                                                                           | 0/47 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  47
Oldest Available Article:  2019-01-11
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 47/47 [02:30<00:00,  3.20s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           1
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (46, 5)
Data saved for BPCL for year 2019 .
Nifty50 Extraction Search Count : 40


  0%|                                                                                            | 0/4 [00:00<?, ?it/s]

Total number of result pages for JSWSTEEL in the year 2019 : 4
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:24<00:00,  6.20s/it]
  0%|                                                                                           | 0/73 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  73
Oldest Available Article:  2019-01-01
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 73/73 [03:13<00:00,  2.65s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           2
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (71, 5)
Data saved for JSWSTEEL for year 2019 .
Nifty50 Extraction Search Count : 41


  0%|                                                                                            | 0/2 [00:00<?, ?it/s]

Total number of result pages for UPL in the year 2019 : 2
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:13<00:00,  6.72s/it]
  0%|                                                                                           | 0/27 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  27
Oldest Available Article:  2019-01-16
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 27/27 [01:12<00:00,  2.67s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           1
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (26, 5)
Data saved for UPL for year 2019 .
Nifty50 Extraction Search Count : 42


  0%|                                                                                            | 0/2 [00:00<?, ?it/s]

Total number of result pages for SHREECEM in the year 2019 : 2
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:01<00:00,  1.51it/s]
  0%|                                                                                           | 0/28 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  28
Oldest Available Article:  2019-01-10
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 28/28 [01:16<00:00,  2.73s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           1
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (27, 5)
Data saved for SHREECEM for year 2019 .
Nifty50 Extraction Search Count : 43


  0%|                                                                                            | 0/6 [00:00<?, ?it/s]

Total number of result pages for TATASTEEL in the year 2019 : 6
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:40<00:00,  6.68s/it]
  0%|                                                                                          | 0/102 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  102
Oldest Available Article:  2019-01-02
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 102/102 [04:25<00:00,  2.60s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           2
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (100, 5)
Data saved for TATASTEEL for year 2019 .
Nifty50 Extraction Search Count : 44


  0%|                                                                                            | 0/2 [00:00<?, ?it/s]

Total number of result pages for HINDALCO in the year 2019 : 2
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:15<00:00,  7.53s/it]
  0%|                                                                                           | 0/39 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  39
Oldest Available Article:  2019-01-07
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 39/39 [01:38<00:00,  2.53s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           1
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (38, 5)
Data saved for HINDALCO for year 2019 .
Nifty50 Extraction Search Count : 45


  0%|                                                                                            | 0/2 [00:00<?, ?it/s]

Total number of result pages for ADANIPORTS in the year 2019 : 2
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:16<00:00,  8.49s/it]
  0%|                                                                                           | 0/24 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  24
Oldest Available Article:  2019-01-03
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 24/24 [01:04<00:00,  2.69s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           1
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (23, 5)
Data saved for ADANIPORTS for year 2019 .
Nifty50 Extraction Search Count : 46


  0%|                                                                                            | 0/3 [00:00<?, ?it/s]

Total number of result pages for ONGC in the year 2019 : 3
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:01<00:00,  1.61it/s]
  0%|                                                                                           | 0/47 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  47
Oldest Available Article:  2019-01-06
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 47/47 [02:14<00:00,  2.87s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           0
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (47, 5)
Data saved for ONGC for year 2019 .
Nifty50 Extraction Search Count : 47


  0%|                                                                                            | 0/4 [00:00<?, ?it/s]

Total number of result pages for COALINDIA in the year 2019 : 4
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:03<00:00,  1.01it/s]
  0%|                                                                                           | 0/76 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  76
Oldest Available Article:  2019-01-01
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 76/76 [04:01<00:00,  3.18s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           1
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (75, 5)
Data saved for COALINDIA for year 2019 .
Nifty50 Extraction Search Count : 48


  0%|                                                                                            | 0/9 [00:00<?, ?it/s]

Total number of result pages for TATAMOTORS in the year 2019 : 9
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:08<00:00,  1.05it/s]
  0%|                                                                                          | 0/172 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  172
Oldest Available Article:  2019-01-03
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 172/172 [07:17<00:00,  2.55s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           5
Published_Dates    0
Source_URLs        0
ByLines            1
dtype: int64
Total Usable Scraped Data :  (166, 5)
Data saved for TATAMOTORS for year 2019 .
Nifty50 Extraction Search Count : 49


  0%|                                                                                            | 0/4 [00:00<?, ?it/s]

Total number of result pages for IOC in the year 2019 : 4
[INFO] Extracting Links...


 50%|██████████████████████████████████████████                                          | 2/4 [00:01<00:01,  1.13it/s]
  0%|                                                                                           | 0/40 [00:00<?, ?it/s]

Exception occurred in url :  https://www.moneycontrol.com/stocks/company_info/stock_news.php?sc_id=IOC&scat=&pageno=3&next=0&durationType=Y&Year=2019&duration=1&news_type=
[INFO] Links Extracted.
Total No. of Pages to be Scraped =  40
Oldest Available Article:  2019-05-17
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 40/40 [01:45<00:00,  2.63s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           0
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (40, 5)
Data saved for IOC for year 2019 .
Nifty50 Extraction Search Count : 50


  0%|                                                                                            | 0/3 [00:00<?, ?it/s]

Total number of result pages for GAIL in the year 2019 : 3
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:19<00:00,  6.51s/it]
  0%|                                                                                           | 0/48 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  48
Oldest Available Article:  2019-01-09
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 48/48 [02:06<00:00,  2.63s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           1
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (47, 5)
Data saved for GAIL for year 2019 .


In [None]:
# Pages whose Articles field came back blank might have a different HTML structure.
# The article text contains '\xa0' (non-breaking space) characters, which were '&nbsp;' in the HTML. These will be replaced by ' ' during preprocessing.
# Some pages have no byline; some of these are video articles.

# The article URL below raises an error: 'TooManyRedirects: Exceeded 30 redirects.'
# https://www.moneycontrol.com/news/business/ready-%C3%A2%C2%80%C2%8Ato-%C3%A2%C2%80%C2%8Asacrifice%C3%A2%C2%80%C2%8A-growth%C3%A2%C2%80%C2%8A-in%C3%A2%C2%80%C2%8A-some-segments%C3%A2%C2%80%C2%8A-to-pursue-sustainability-axis-bank_12845341.html
