In [1]:
from bs4 import BeautifulSoup
from requests import get
import pandas as pd
from tqdm import tqdm
from dateutil import parser
import string
import re
import time

In [2]:
# Scrape the latest Nifty50 index composition from moneycontrol: company names,
# industries, last prices, changes and market caps, stamped with the page's date.

getnifty50 = "https://www.moneycontrol.com/stocks/marketstats/indexcomp.php?optex=NSE&opttopic=indexcomp&index=9"
soup = BeautifulSoup(get(getnifty50).text, 'lxml')
composition_n50 = soup.select('table.tbldata14.bdrtpg')[0]

# Date/time string displayed on the page, parsed into a datetime.
Scrape_date = parser.parse(soup.select('div.FR.b_15.PT5')[0].text)

# Run each CSS selection once, then slice the stripped texts into columns.
link_texts = [tag.text.strip() for tag in composition_n50.select("a.bl_12")]
cell_texts = [tag.text.strip() for tag in composition_n50.select('td.brdrgtgry')]

# The first two links are skipped; after that the links alternate company / industry.
Company_Name = link_texts[2::2]
Industry = link_texts[3::2]
# urlsplit = [script.get('href').split('/')[-1] for script in composition_n50.select("a.bl_12")[2::2]]

# The cells are read with a stride of six, at offsets 2..5 within each group.
Last_Price = cell_texts[2::6]
Change = cell_texts[3::6]
Change_percent = cell_texts[4::6]
Mrk_Cap = cell_texts[5::6]

nifty50_latest = pd.DataFrame({
    'Company_Name': Company_Name,
    'Industry': Industry,
    'Last_Price': Last_Price,
    'Change': Change,
    'Change_percent': Change_percent,
    'Mrk_Cap(Rs Cr)': Mrk_Cap,
}).assign(Scrape_date=Scrape_date)

print(nifty50_latest.shape)
nifty50_latest.to_csv("nifty50_latest.csv")
nifty50_latest.head()

(50, 7)


Unnamed: 0,Company_Name,Industry,Last_Price,Change,Change_percent,Mrk_Cap(Rs Cr),Scrape_date
0,Adani Ports,Transport Infrastructure,368.65,7.55,2.09,74900.53,2020-11-05 15:59:00
1,Asian Paints,Paints,2238.3,68.85,3.17,214697.24,2020-11-05 15:59:00
2,Axis Bank,Bank - Private,539.3,14.15,2.69,165036.05,2020-11-05 15:59:00
3,Bajaj Auto,Automobile - 2 & 3 Wheelers,2949.3,23.2,0.79,85343.02,2020-11-05 15:59:00
4,Bajaj Finance,Finance - NBFC,3736.9,175.45,4.93,225180.86,2020-11-05 15:59:00


In [3]:
# Load the lookup table of Nifty50 stocks and collect the MoneyControl
# url sub-string of every row, in table order.
nifty50_lookuptable = pd.read_csv("nifty50_lookuptable.csv")
Substring = list(nifty50_lookuptable['mcontrol_substring'])
# cnames = [i for i in nifty50_lookuptable['Company Name']]
print(nifty50_lookuptable.shape)
nifty50_lookuptable.head()

(50, 6)


Unnamed: 0,Sr.No.,Company Name,Sector,Weightage,thehindu_searchstring,mcontrol_substring
0,1,Reliance Industries Ltd.,Petroleum Products,14.93%,reliance%20petroleum,RI
1,2,HDFC Bank Ltd.,Banks,9.69%,hdfc%20bank,HDF01
2,3,Infosys Limited,Software,7.63%,infosys,IT
3,4,Housing Development Fin. Corp. Ltd.,Finance,6.44%,hdfc,HDF
4,5,Tata Consultancy Services Ltd.,Software,5.41%,tcs,TCS


In [4]:
def initialize(yr, urlsplit):
    '''
    Prepare the module-level scraping state for one company and one year.

    Fetches the moneycontrol news search page for the company identified by
    'urlsplit' in the year 'yr', reads the ticker and the number of result
    pages from it, builds the list of result-page urls and resets the lists
    that collect the scraped data.

    Parameters:
        yr       : year to search, substituted into the url.
        urlsplit : moneycontrol company sub-string, used as the 'sc_id' url parameter.

    Sets the globals 'ticker', 'url_all', 'headlines', 'dates', 'news',
    'urls' and 'sources'. Returns nothing.

    Raises IndexError if the ticker block is missing or not in the expected
    '...|<label>:<ticker>|...' form.
    '''
    global ticker, url_all, headlines, dates, news, urls, sources

    # Both values go through a single format call, so a brace inside
    # 'urlsplit' cannot be mistaken for a format field.
    urlyr = "https://www.moneycontrol.com/stocks/company_info/stock_news.php?sc_id={}&durationType=Y&Year={}".format(urlsplit, yr)
    url = "https://www.moneycontrol.com/stocks/company_info/stock_news.php?sc_id={}&scat=&pageno={}&next=0&durationType=Y&Year={}&duration=1&news_type="

    soup = BeautifulSoup(get(urlyr).text, 'lxml')
    # The ticker is the value after ':' in the second '|'-separated field.
    ticker = soup.select('div.FL.gry10')[0].text.split('|')[1].split(':')[1].strip()

    # Page count = number of links in the pagination block + 1.
    # If the block is absent (presumably a single page of results), use 1
    # instead of failing with IndexError.
    pagers = soup.select('div.pages.MR10.MT15')
    result_max = len(pagers[0].select('a')) + 1 if pagers else 1

    url_all = [url.format(urlsplit, i, yr) for i in range(1, result_max+1)]
    headlines, dates, news, urls, sources = [], [], [], [], []
    print("Total number of result pages for", ticker, "in the year", yr, ":", result_max)

In [5]:
def getnewslinks():
    '''
    Scrape headlines, article urls, publish dates and bylines from every
    result page in the global 'url_all'.

    For each headline link found, exactly one entry is appended to each of the
    globals 'headlines', 'urls', 'dates' and 'sources', so the four lists
    always keep the same length. A field that cannot be extracted is stored
    as None.

    If a result page cannot be fetched or parsed, the url is reported and the
    crawl stops; entries collected so far are kept.
    '''
    print("[INFO] Extracting Links...")

    for src in tqdm(url_all):
        try:
            soup = BeautifulSoup(get(src).text, 'lxml')
        except Exception:
            # Deliberately not a bare 'except': KeyboardInterrupt must still stop the run.
            print("Exception occurred in url : ", src)
            break

        anchors = soup.select('a.g_14bl')
        metas = soup.select('p.PT3.a_10dgry')
        if len(anchors) != len(metas):
            print("Mismatched headline/dateline counts in url : ", src)

        # The i-th headline link is paired with the i-th date/byline paragraph.
        for idx, anchor in enumerate(anchors):
            meta = metas[idx] if idx < len(metas) else None
            href = anchor.get('href')

            headlines.append(anchor.text.strip())
            urls.append("https://www.moneycontrol.com" + href if href else None)
            dates.append(_published_date(meta) if meta is not None else None)
            sources.append(_byline(meta) if meta is not None else None)

    print("[INFO] Links Extracted.")
    print("Total No. of Pages to be Scraped = ", len(urls))
    # min() fails on an empty list and cannot compare None with a string.
    known_dates = [d for d in dates if d is not None]
    print("Oldest Available Article: ", min(known_dates) if known_dates else None)


def _published_date(meta):
    '''Return the 'YYYY-MM-DD' date parsed from the second '|'-separated field of a listing paragraph, or None.'''
    try:
        return str(parser.parse(meta.text.split('|')[1].strip())).split()[0]
    except Exception:
        print('Exception in dateline')
        return None


def _byline(meta):
    '''Return the stripped text of the paragraph's single 'span.a_2_10bl', or None if there is not exactly one.'''
    spans = meta.select('span.a_2_10bl')
    return spans[0].text.strip() if len(spans) == 1 else None

In [6]:
def getarticles(thres=7):
    '''
    Scrape the article text for every url in the global 'urls'.

    Parameters:
        thres : minimum number of whitespace-separated words a paragraph must
                have to be kept; shorter paragraphs are ignored.

    Appends exactly one entry to the global 'news' per url, in order: the kept
    paragraphs of the first 'div.arti-flow' joined with '.', or None when the
    page cannot be fetched or the article body cannot be extracted.
    '''
    print("[INFO] Extracting Articles...")

    for src in tqdm(urls):
        # Fetch and parse the news page. 'except Exception' (rather than a bare
        # 'except') lets KeyboardInterrupt stop this long-running loop.
        try:
            soup = BeautifulSoup(get(src).text, 'lxml')
        except Exception:
            print("Exception occurred in url : ", src)
            news.append(None)
            continue

        # Extract the article body; a missing container yields None.
        try:
            paragraphs = soup.select("div.arti-flow")[0].select("p")
            news_article = '.'.join([scrape.text.strip() for scrape in paragraphs
                                     if len(scrape.text.split()) >= thres])
        except Exception:
            news_article = None
        news.append(news_article)

    print("[INFO] Articles Extracted.")

In [7]:
def chkdata():
    '''
    Assemble the scraped lists into the global DataFrame 'df', report the
    number of missing values per column and keep only the complete rows.
    '''
    global df
    columns = {
        'Headlines': headlines,
        'Articles': news,
        'Published_Dates': dates,
        'Source_URLs': urls,
        'ByLines': sources,
    }
    scraped = pd.DataFrame(columns)

    print("Missing Info in Scraped Data :")
    print(scraped.isna().sum())

    # Any row with at least one missing field is discarded.
    df = scraped.dropna(axis=0)
    print("Total Usable Scraped Data : ", df.shape)

In [8]:
def savefile(tickr,yr):
    '''
    Pickle the global DataFrame 'df' to 'news_mcontrol_<tickr>_<yr>.pkl'
    in the working directory and print a confirmation.
    '''
    target = "".join(["news_mcontrol_", tickr, "_", str(yr), ".pkl"])
    df.to_pickle(target)
    print("Data saved for", tickr, "for year",yr, ".")

In [None]:
# Scrape the 2020 news articles for every company in the Nifty50 lookup table

yr = 2020

for i, sstring in enumerate(Substring):
    print("Nifty50 Extraction Search Count :", i + 1)
    # initialize() sets the global `ticker` and resets the result lists for this company.
    initialize(yr, sstring)
    # Collect links, then article bodies, then build and clean the DataFrame.
    for step in (getnewslinks, getarticles, chkdata):
        step()
    savefile(ticker, yr)
    # Pause five seconds before moving on to the next company.
    time.sleep(5)

Nifty50 Extraction Search Count : 1


  0%|                                                                                            | 0/8 [00:00<?, ?it/s]

Total number of result pages for RELIANCE in the year 2020 : 8
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:04<00:00,  1.66it/s]
  0%|                                                                                          | 0/150 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  150
Oldest Available Article:  2020-04-30
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 150/150 [05:49<00:00,  2.33s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           3
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (147, 5)
Data saved for RELIANCE for year 2020 .
Nifty50 Extraction Search Count : 2


  0%|                                                                                            | 0/6 [00:00<?, ?it/s]

Total number of result pages for HDFCBANK in the year 2020 : 6
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:03<00:00,  1.51it/s]
  0%|                                                                                          | 0/119 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  119
Oldest Available Article:  2020-02-20
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 119/119 [04:50<00:00,  2.44s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           5
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (114, 5)
Data saved for HDFCBANK for year 2020 .
Nifty50 Extraction Search Count : 3


  0%|                                                                                            | 0/6 [00:00<?, ?it/s]

Total number of result pages for INFY in the year 2020 : 6
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:03<00:00,  1.64it/s]
  0%|                                                                                          | 0/120 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  120
Oldest Available Article:  2020-03-03
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 120/120 [04:40<00:00,  2.34s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           4
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (116, 5)
Data saved for INFY for year 2020 .
Nifty50 Extraction Search Count : 4


  0%|                                                                                            | 0/3 [00:00<?, ?it/s]

Total number of result pages for HDFC in the year 2020 : 3
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:02<00:00,  1.46it/s]
  0%|                                                                                           | 0/57 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  57
Oldest Available Article:  2020-03-05
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 57/57 [02:22<00:00,  2.49s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           0
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (57, 5)
Data saved for HDFC for year 2020 .
Nifty50 Extraction Search Count : 5


  0%|                                                                                            | 0/5 [00:00<?, ?it/s]

Total number of result pages for TCS in the year 2020 : 5
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:18<00:00,  3.66s/it]
  0%|                                                                                          | 0/100 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  100
Oldest Available Article:  2020-02-04
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [03:49<00:00,  2.30s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           5
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (95, 5)
Data saved for TCS for year 2020 .
Nifty50 Extraction Search Count : 6


  0%|                                                                                            | 0/5 [00:00<?, ?it/s]

Total number of result pages for ICICIBANK in the year 2020 : 5
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:03<00:00,  1.41it/s]
  0%|                                                                                           | 0/99 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  99
Oldest Available Article:  2020-04-01
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 99/99 [03:54<00:00,  2.37s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           0
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (99, 5)
Data saved for ICICIBANK for year 2020 .
Nifty50 Extraction Search Count : 7


  0%|                                                                                            | 0/3 [00:00<?, ?it/s]

Total number of result pages for KOTAKBANK in the year 2020 : 3
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:02<00:00,  1.38it/s]
  0%|                                                                                           | 0/55 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  55
Oldest Available Article:  2020-04-23
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 55/55 [02:12<00:00,  2.41s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           3
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (52, 5)
Data saved for KOTAKBANK for year 2020 .
Nifty50 Extraction Search Count : 8


  0%|                                                                                            | 0/5 [00:00<?, ?it/s]

Total number of result pages for HINDUNILVR in the year 2020 : 5
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:03<00:00,  1.45it/s]
  0%|                                                                                          | 0/100 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  100
Oldest Available Article:  2020-03-23
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [03:27<00:00,  2.07s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           6
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (94, 5)
Data saved for HINDUNILVR for year 2020 .
Nifty50 Extraction Search Count : 9


  0%|                                                                                            | 0/3 [00:00<?, ?it/s]

Total number of result pages for ITC in the year 2020 : 3
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:12<00:00,  4.04s/it]
  0%|                                                                                           | 0/46 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  46
Oldest Available Article:  2020-04-14
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 46/46 [01:30<00:00,  1.96s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           0
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (46, 5)
Data saved for ITC for year 2020 .
Nifty50 Extraction Search Count : 10


  0%|                                                                                            | 0/5 [00:00<?, ?it/s]

Total number of result pages for LT in the year 2020 : 5
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:28<00:00,  5.64s/it]
  0%|                                                                                           | 0/90 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  90
Oldest Available Article:  2020-03-30
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 90/90 [03:38<00:00,  2.43s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           2
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (88, 5)
Data saved for LT for year 2020 .
Nifty50 Extraction Search Count : 11


  0%|                                                                                            | 0/4 [00:00<?, ?it/s]

Total number of result pages for AXISBANK in the year 2020 : 4
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:21<00:00,  5.47s/it]
  0%|                                                                                           | 0/73 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  73
Oldest Available Article:  2020-04-02
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 73/73 [03:09<00:00,  2.59s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           5
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (68, 5)
Data saved for AXISBANK for year 2020 .
Nifty50 Extraction Search Count : 12


  0%|                                                                                            | 0/7 [00:00<?, ?it/s]

Total number of result pages for BHARTIARTL in the year 2020 : 7
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:33<00:00,  4.76s/it]
  0%|                                                                                          | 0/140 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  140
Oldest Available Article:  2020-02-17
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 140/140 [05:38<00:00,  2.42s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           2
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (138, 5)
Data saved for BHARTIARTL for year 2020 .
Nifty50 Extraction Search Count : 13


  0%|                                                                                            | 0/3 [00:00<?, ?it/s]

Total number of result pages for ASIANPAINT in the year 2020 : 3
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:14<00:00,  4.82s/it]
  0%|                                                                                           | 0/52 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  52
Oldest Available Article:  2020-01-23
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 52/52 [01:58<00:00,  2.29s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           4
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (48, 5)
Data saved for ASIANPAINT for year 2020 .
Nifty50 Extraction Search Count : 14


  0%|                                                                                           | 0/10 [00:00<?, ?it/s]

Total number of result pages for MARUTI in the year 2020 : 10
[INFO] Extracting Links...


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:52<00:00,  5.20s/it]
  0%|                                                                                          | 0/200 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  200
Oldest Available Article:  2020-01-27
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [07:51<00:00,  2.36s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           3
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (197, 5)
Data saved for MARUTI for year 2020 .
Nifty50 Extraction Search Count : 15


  0%|                                                                                            | 0/4 [00:00<?, ?it/s]

Total number of result pages for HCLTECH in the year 2020 : 4
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:21<00:00,  5.27s/it]
  0%|                                                                                           | 0/75 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  75
Oldest Available Article:  2020-01-22
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 75/75 [02:53<00:00,  2.32s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           4
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (71, 5)
Data saved for HCLTECH for year 2020 .
Nifty50 Extraction Search Count : 16


  0%|                                                                                            | 0/2 [00:00<?, ?it/s]

Total number of result pages for BAJFINANCE in the year 2020 : 2
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:09<00:00,  4.94s/it]
  0%|                                                                                           | 0/26 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  26
Oldest Available Article:  2020-05-19
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 26/26 [01:00<00:00,  2.34s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           0
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (26, 5)
Data saved for BAJFINANCE for year 2020 .
Nifty50 Extraction Search Count : 17


  0%|                                                                                           | 0/11 [00:00<?, ?it/s]

Total number of result pages for SBIN in the year 2020 : 11
[INFO] Extracting Links...


100%|██████████████████████████████████████████████████████████████████████████████████| 11/11 [00:58<00:00,  5.35s/it]
  0%|                                                                                          | 0/219 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  219
Oldest Available Article:  2020-03-06
[INFO] Extracting Articles...


 10%|████████▏                                                                        | 22/219 [00:52<07:32,  2.30s/it]

In [None]:
# Scrape the 2020 news articles for the Nifty50 companies, resuming at the 17th entry

yr = 2020

for i, sstring in enumerate(Substring):
    # Entries with index 0-15 are skipped.
    if i < 16:
        continue
    print("Nifty50 Extraction Search Count :", i + 1)
    # initialize() sets the global `ticker` and resets the result lists for this company.
    initialize(yr, sstring)
    for step in (getnewslinks, getarticles, chkdata):
        step()
    savefile(ticker, yr)
    # Pause five seconds before moving on to the next company.
    time.sleep(5)

Nifty50 Extraction Search Count : 17


  0%|                                                                                           | 0/11 [00:00<?, ?it/s]

Total number of result pages for SBIN in the year 2020 : 11
[INFO] Extracting Links...


100%|██████████████████████████████████████████████████████████████████████████████████| 11/11 [00:48<00:00,  4.43s/it]
  0%|                                                                                          | 0/219 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  219
Oldest Available Article:  2020-03-06
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 219/219 [08:49<00:00,  2.42s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           6
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (213, 5)
Data saved for SBIN for year 2020 .
Nifty50 Extraction Search Count : 18


  0%|                                                                                            | 0/3 [00:00<?, ?it/s]

Total number of result pages for DRREDDY in the year 2020 : 3
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:15<00:00,  5.15s/it]
  0%|                                                                                           | 0/60 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  60
Oldest Available Article:  2020-05-21
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 60/60 [02:38<00:00,  2.64s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           3
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (57, 5)
Data saved for DRREDDY for year 2020 .
Nifty50 Extraction Search Count : 19


  0%|                                                                                            | 0/5 [00:00<?, ?it/s]

Total number of result pages for M&M in the year 2020 : 5
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:26<00:00,  5.27s/it]
  0%|                                                                                           | 0/98 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  98
Oldest Available Article:  2020-02-11
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 98/98 [03:57<00:00,  2.43s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           4
Published_Dates    0
Source_URLs        0
ByLines            1
dtype: int64
Total Usable Scraped Data :  (93, 5)
Data saved for M&M for year 2020 .
Nifty50 Extraction Search Count : 20


  0%|                                                                                            | 0/2 [00:00<?, ?it/s]

Total number of result pages for NESTLEIND in the year 2020 : 2
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:09<00:00,  4.65s/it]
  0%|                                                                                           | 0/26 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  26
Oldest Available Article:  2020-05-12
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 26/26 [01:01<00:00,  2.37s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           1
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (25, 5)
Data saved for NESTLEIND for year 2020 .
Nifty50 Extraction Search Count : 21


  0%|                                                                                            | 0/3 [00:00<?, ?it/s]

Total number of result pages for SUNPHARMA in the year 2020 : 3
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:16<00:00,  5.40s/it]
  0%|                                                                                           | 0/56 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  56
Oldest Available Article:  2020-05-23
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 56/56 [02:14<00:00,  2.40s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           3
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (53, 5)
Data saved for SUNPHARMA for year 2020 .
Nifty50 Extraction Search Count : 22


  0%|                                                                                            | 0/3 [00:00<?, ?it/s]

Total number of result pages for TITAN in the year 2020 : 3
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:06<00:00,  2.07s/it]
  0%|                                                                                           | 0/52 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  52
Oldest Available Article:  2020-02-07
[INFO] Extracting Articles...


  8%|██████▍                                                                            | 4/52 [00:09<01:57,  2.45s/it]

In [9]:
# Scrape the 2020 news articles for the Nifty50 companies, resuming at the 22nd entry

yr = 2020

for i, sstring in enumerate(Substring):
    # Entries with index 0-20 are skipped.
    if i < 21:
        continue
    print("Nifty50 Extraction Search Count :", i + 1)
    # initialize() sets the global `ticker` and resets the result lists for this company.
    initialize(yr, sstring)
    for step in (getnewslinks, getarticles, chkdata):
        step()
    savefile(ticker, yr)
    # Pause five seconds before moving on to the next company.
    time.sleep(5)

Nifty50 Extraction Search Count : 22


  0%|                                                                                            | 0/3 [00:00<?, ?it/s]

Total number of result pages for TITAN in the year 2020 : 3
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:17<00:00,  5.77s/it]
  0%|                                                                                           | 0/54 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  54
Oldest Available Article:  2020-02-07
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 54/54 [02:50<00:00,  3.17s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           1
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (53, 5)
Data saved for TITAN for year 2020 .
Nifty50 Extraction Search Count : 23


  0%|                                                                                            | 0/3 [00:00<?, ?it/s]

Total number of result pages for TECHM in the year 2020 : 3
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:15<00:00,  5.27s/it]
  0%|                                                                                           | 0/49 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  49
Oldest Available Article:  2020-04-08
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 49/49 [02:34<00:00,  3.15s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           3
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (46, 5)
Data saved for TECHM for year 2020 .
Nifty50 Extraction Search Count : 24


  0%|                                                                                            | 0/3 [00:00<?, ?it/s]

Total number of result pages for ULTRACEMCO in the year 2020 : 3
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:15<00:00,  5.29s/it]
  0%|                                                                                           | 0/42 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  42
Oldest Available Article:  2020-05-20
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 42/42 [02:00<00:00,  2.87s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           3
Published_Dates    0
Source_URLs        0
ByLines            1
dtype: int64
Total Usable Scraped Data :  (38, 5)
Data saved for ULTRACEMCO for year 2020 .
Nifty50 Extraction Search Count : 25


  0%|                                                                                            | 0/4 [00:00<?, ?it/s]

Total number of result pages for WIPRO in the year 2020 : 4
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:21<00:00,  5.42s/it]
  0%|                                                                                           | 0/71 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  71
Oldest Available Article:  2020-04-07
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 71/71 [02:58<00:00,  2.51s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           3
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (68, 5)
Data saved for WIPRO for year 2020 .
Nifty50 Extraction Search Count : 26


  0%|                                                                                            | 0/3 [00:00<?, ?it/s]

Total number of result pages for BRITANNIA in the year 2020 : 3
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:15<00:00,  5.32s/it]
  0%|                                                                                           | 0/52 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  52
Oldest Available Article:  2020-03-26
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 52/52 [02:16<00:00,  2.62s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           2
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (50, 5)
Data saved for BRITANNIA for year 2020 .
Nifty50 Extraction Search Count : 27


  0%|                                                                                            | 0/2 [00:00<?, ?it/s]

Total number of result pages for HDFCLIFE in the year 2020 : 2
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:08<00:00,  4.50s/it]
  0%|                                                                                           | 0/35 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  35
Oldest Available Article:  2020-03-18
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 35/35 [01:08<00:00,  1.96s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           0
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (35, 5)
Data saved for HDFCLIFE for year 2020 .
Nifty50 Extraction Search Count : 28


  0%|                                                                                            | 0/2 [00:00<?, ?it/s]

Total number of result pages for POWERGRID in the year 2020 : 2
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:05<00:00,  2.90s/it]
  0%|                                                                                           | 0/19 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  19
Oldest Available Article:  2020-01-14
[INFO] Extracting Articles...


 42%|██████████████████████████████████▉                                                | 8/19 [00:42<01:33,  8.46s/it]

Exception occurred in url :  https://www.moneycontrol.com/news/results/power-grid-corp-consolidated-march-2020-net-sales-at-rs-1014826-crore643-y-o-y_13934781.html


100%|██████████████████████████████████████████████████████████████████████████████████| 19/19 [01:11<00:00,  3.75s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           2
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (17, 5)
Data saved for POWERGRID for year 2020 .
Nifty50 Extraction Search Count : 29


  0%|                                                                                            | 0/3 [00:00<?, ?it/s]

Total number of result pages for NTPC in the year 2020 : 3
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:17<00:00,  5.77s/it]
  0%|                                                                                           | 0/60 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  60
Oldest Available Article:  2020-01-01
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 60/60 [02:48<00:00,  2.81s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           0
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (60, 5)
Data saved for NTPC for year 2020 .
Nifty50 Extraction Search Count : 30


  0%|                                                                                            | 0/5 [00:00<?, ?it/s]

Total number of result pages for HEROMOTOCO in the year 2020 : 5
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:12<00:00,  2.58s/it]
  0%|                                                                                           | 0/91 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  91
Oldest Available Article:  2020-03-23
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 91/91 [04:29<00:00,  2.96s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           3
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (88, 5)
Data saved for HEROMOTOCO for year 2020 .
Nifty50 Extraction Search Count : 31


  0%|                                                                                            | 0/4 [00:00<?, ?it/s]

Total number of result pages for CIPLA in the year 2020 : 4
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:11<00:00,  2.87s/it]
  0%|                                                                                           | 0/64 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  64
Oldest Available Article:  2020-03-26
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 64/64 [03:10<00:00,  2.98s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           3
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (61, 5)
Data saved for CIPLA for year 2020 .
Nifty50 Extraction Search Count : 32


  0%|                                                                                            | 0/2 [00:00<?, ?it/s]

Total number of result pages for DIVISLAB in the year 2020 : 2
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:01<00:00,  1.57it/s]
  0%|                                                                                           | 0/22 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  22
Oldest Available Article:  2020-02-01
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 22/22 [01:03<00:00,  2.90s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           1
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (21, 5)
Data saved for DIVISLAB for year 2020 .
Nifty50 Extraction Search Count : 33


  0%|                                                                                            | 0/6 [00:00<?, ?it/s]

Total number of result pages for BAJAJ-AUTO in the year 2020 : 6
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:08<00:00,  1.43s/it]
  0%|                                                                                          | 0/106 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  106
Oldest Available Article:  2020-02-04
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 106/106 [06:13<00:00,  3.53s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           3
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (103, 5)
Data saved for BAJAJ-AUTO for year 2020 .
Nifty50 Extraction Search Count : 34


  0%|                                                                                            | 0/2 [00:00<?, ?it/s]

Total number of result pages for BAJAJFINSV in the year 2020 : 2
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:01<00:00,  1.47it/s]
  0%|                                                                                           | 0/27 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  27
Oldest Available Article:  2020-01-29
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 27/27 [01:33<00:00,  3.45s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           0
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (27, 5)
Data saved for BAJAJFINSV for year 2020 .
Nifty50 Extraction Search Count : 35


  0%|                                                                                            | 0/2 [00:00<?, ?it/s]

Total number of result pages for SBILIFE in the year 2020 : 2
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:01<00:00,  1.52it/s]
  0%|                                                                                           | 0/32 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  32
Oldest Available Article:  2020-01-22
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 32/32 [01:33<00:00,  2.91s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           0
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (32, 5)
Data saved for SBILIFE for year 2020 .
Nifty50 Extraction Search Count : 36


  0%|                                                                                            | 0/3 [00:00<?, ?it/s]

Total number of result pages for EICHERMOT in the year 2020 : 3
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:04<00:00,  1.39s/it]
  0%|                                                                                           | 0/45 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  45
Oldest Available Article:  2020-03-03
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 45/45 [01:53<00:00,  2.52s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           2
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (43, 5)
Data saved for EICHERMOT for year 2020 .
Nifty50 Extraction Search Count : 37


  0%|                                                                                            | 0/3 [00:00<?, ?it/s]

Total number of result pages for INDUSINDBK in the year 2020 : 3
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:02<00:00,  1.20it/s]
  0%|                                                                                           | 0/46 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  46
Oldest Available Article:  2020-04-29
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 46/46 [01:47<00:00,  2.34s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           2
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (44, 5)
Data saved for INDUSINDBK for year 2020 .
Nifty50 Extraction Search Count : 38


  0%|                                                                                            | 0/2 [00:00<?, ?it/s]

Total number of result pages for GRASIM in the year 2020 : 2
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:01<00:00,  1.57it/s]
  0%|                                                                                           | 0/21 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  21
Oldest Available Article:  2020-02-10
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 21/21 [01:01<00:00,  2.91s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           0
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (21, 5)
Data saved for GRASIM for year 2020 .
Nifty50 Extraction Search Count : 39


  0%|                                                                                            | 0/3 [00:00<?, ?it/s]

Total number of result pages for BPCL in the year 2020 : 3
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:09<00:00,  3.01s/it]
  0%|                                                                                           | 0/49 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  49
Oldest Available Article:  2020-02-14
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 49/49 [02:36<00:00,  3.19s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           0
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (49, 5)
Data saved for BPCL for year 2020 .
Nifty50 Extraction Search Count : 40


  0%|                                                                                            | 0/3 [00:00<?, ?it/s]

Total number of result pages for JSWSTEEL in the year 2020 : 3
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:18<00:00,  6.04s/it]
  0%|                                                                                           | 0/60 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  60
Oldest Available Article:  2020-01-24
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 60/60 [02:50<00:00,  2.84s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           2
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (58, 5)
Data saved for JSWSTEEL for year 2020 .
Nifty50 Extraction Search Count : 41


  0%|                                                                                            | 0/2 [00:00<?, ?it/s]

Total number of result pages for UPL in the year 2020 : 2
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:10<00:00,  5.47s/it]
  0%|                                                                                           | 0/23 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  23
Oldest Available Article:  2020-05-25
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 23/23 [01:09<00:00,  3.03s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           2
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (21, 5)
Data saved for UPL for year 2020 .
Nifty50 Extraction Search Count : 42


  0%|                                                                                            | 0/2 [00:00<?, ?it/s]

Total number of result pages for SHREECEM in the year 2020 : 2
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:15<00:00,  7.63s/it]
  0%|                                                                                           | 0/29 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  29
Oldest Available Article:  2020-01-07
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 29/29 [01:22<00:00,  2.85s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           2
Published_Dates    0
Source_URLs        0
ByLines            1
dtype: int64
Total Usable Scraped Data :  (26, 5)
Data saved for SHREECEM for year 2020 .
Nifty50 Extraction Search Count : 43


  0%|                                                                                            | 0/4 [00:00<?, ?it/s]

Total number of result pages for TATASTEEL in the year 2020 : 4
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:10<00:00,  2.55s/it]
  0%|                                                                                           | 0/77 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  77
Oldest Available Article:  2020-01-27
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 77/77 [03:41<00:00,  2.88s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           0
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (77, 5)
Data saved for TATASTEEL for year 2020 .
Nifty50 Extraction Search Count : 44


  0%|                                                                                            | 0/2 [00:00<?, ?it/s]

Total number of result pages for HINDALCO in the year 2020 : 2
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:01<00:00,  1.17it/s]
  0%|                                                                                           | 0/32 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  32
Oldest Available Article:  2020-02-12
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 32/32 [01:19<00:00,  2.50s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           0
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (32, 5)
Data saved for HINDALCO for year 2020 .
Nifty50 Extraction Search Count : 45


  0%|                                                                                            | 0/2 [00:00<?, ?it/s]

Total number of result pages for ADANIPORTS in the year 2020 : 2
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:01<00:00,  1.54it/s]
  0%|                                                                                           | 0/30 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  30
Oldest Available Article:  2020-02-04
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [01:15<00:00,  2.52s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           1
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (29, 5)
Data saved for ADANIPORTS for year 2020 .
Nifty50 Extraction Search Count : 46


  0%|                                                                                            | 0/3 [00:00<?, ?it/s]

Total number of result pages for ONGC in the year 2020 : 3
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:05<00:00,  1.73s/it]
  0%|                                                                                           | 0/54 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  54
Oldest Available Article:  2020-01-05
[INFO] Extracting Articles...


 80%|█████████████████████████████████████████████████████████████████▎                | 43/54 [01:54<00:42,  3.82s/it]

Exception occurred in url :  https://www.moneycontrol.com/news/business/ongc-starts-pumping-gasï»¿krishna-godavari-block_13588601.html


100%|██████████████████████████████████████████████████████████████████████████████████| 54/54 [02:26<00:00,  2.72s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           1
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (53, 5)
Data saved for ONGC for year 2020 .
Nifty50 Extraction Search Count : 47


  0%|                                                                                            | 0/4 [00:00<?, ?it/s]

Total number of result pages for COALINDIA in the year 2020 : 4
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:03<00:00,  1.13it/s]
  0%|                                                                                           | 0/62 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  62
Oldest Available Article:  2020-05-16
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 62/62 [02:39<00:00,  2.57s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           0
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (62, 5)
Data saved for COALINDIA for year 2020 .
Nifty50 Extraction Search Count : 48


  0%|                                                                                            | 0/7 [00:00<?, ?it/s]

Total number of result pages for TATAMOTORS in the year 2020 : 7
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:04<00:00,  1.42it/s]
  0%|                                                                                          | 0/140 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  140
Oldest Available Article:  2020-02-05
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 140/140 [06:07<00:00,  2.63s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           2
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (138, 5)
Data saved for TATAMOTORS for year 2020 .
Nifty50 Extraction Search Count : 49


  0%|                                                                                            | 0/4 [00:00<?, ?it/s]

Total number of result pages for IOC in the year 2020 : 4
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:03<00:00,  1.11it/s]
  0%|                                                                                           | 0/73 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  73
Oldest Available Article:  2020-01-13
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 73/73 [03:21<00:00,  2.76s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           0
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (73, 5)
Data saved for IOC for year 2020 .
Nifty50 Extraction Search Count : 50


  0%|                                                                                            | 0/2 [00:00<?, ?it/s]

Total number of result pages for GAIL in the year 2020 : 2
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:01<00:00,  1.21it/s]
  0%|                                                                                           | 0/34 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  34
Oldest Available Article:  2020-01-17
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 34/34 [01:24<00:00,  2.48s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           0
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (34, 5)
Data saved for GAIL for year 2020 .


In [None]:
# Pages with blank Articles might use a different page structure.
# The article text contains '\xa0' characters, which were '&nbsp;' in the HTML; these will be replaced by ' ' during preprocessing.
# Some pages have no byline; some of these are video articles.