In [1]:
from bs4 import BeautifulSoup
from requests import get
import pandas as pd
from tqdm import tqdm
from dateutil import parser
import string
import re
import time

In [2]:
# Scrape the latest Nifty50 Index Composition with previous day details, such as, company names, prices, change etc.

getnifty50 = "https://www.moneycontrol.com/stocks/marketstats/indexcomp.php?optex=NSE&opttopic=indexcomp&index=9"
soup = BeautifulSoup(get(getnifty50).text, 'lxml')
composition_n50 = soup.select('table.tbldata14.bdrtpg')[0]

Scrape_date = parser.parse(soup.select('div.FR.b_15.PT5')[0].text)
Company_Name = [script.text.strip() for script in composition_n50.select("a.bl_12")[2::2]]
Industry = [script.text.strip() for script in composition_n50.select("a.bl_12")[3::2]]
# urlsplit = [script.get('href').split('/')[-1] for script in composition_n50.select("a.bl_12")[2::2]]

Last_Price = [script.text.strip() for script in composition_n50.select('td.brdrgtgry')[2::6]]
Change = [script.text.strip() for script in composition_n50.select('td.brdrgtgry')[3::6]]
Change_percent = [script.text.strip() for script in composition_n50.select('td.brdrgtgry')[4::6]]
Mrk_Cap = [script.text.strip() for script in composition_n50.select('td.brdrgtgry')[5::6]]


nifty50_latest = pd.DataFrame({
    'Company_Name' : Company_Name,
    'Industry' : Industry,
#     'urlsplit' : urlsplit,
    'Last_Price' : Last_Price,
    'Change' : Change,
    'Change_percent' : Change_percent,
    'Mrk_Cap(Rs Cr)' : Mrk_Cap
})

nifty50_latest['Scrape_date'] = Scrape_date
print(nifty50_latest.shape)
nifty50_latest.to_csv("nifty50_latest.csv")
nifty50_latest.head()

(50, 7)


Unnamed: 0,Company_Name,Industry,Last_Price,Change,Change_percent,Mrk_Cap(Rs Cr),Scrape_date
0,Adani Ports,Transport Infrastructure,353.65,-3.25,-0.91,71852.9,2020-11-03 15:59:00
1,Asian Paints,Paints,2154.65,-18.2,-0.84,206673.55,2020-11-03 15:59:00
2,Axis Bank,Bank - Private,534.15,11.5,2.2,163459.24,2020-11-03 15:59:00
3,Bajaj Auto,Automobile - 2 & 3 Wheelers,2914.85,71.15,2.5,84346.15,2020-11-03 15:59:00
4,Bajaj Finance,Finance - NBFC,3490.8,71.0,2.08,210351.19,2020-11-03 15:59:00


In [3]:
# Lookup Table with Nifty50 stocks and MoneyControl url sub-strings
nifty50_lookuptable = pd.read_csv("nifty50_lookuptable.csv")
Substring = [i for i in nifty50_lookuptable['mcontrol_substring']]
# cnames = [i for i in nifty50_lookuptable['Company Name']]
print(nifty50_lookuptable.shape)
nifty50_lookuptable.head()

(50, 6)


Unnamed: 0,Sr.No.,Company Name,Sector,Weightage,thehindu_searchstring,mcontrol_substring
0,1,Reliance Industries Ltd.,Petroleum Products,14.93%,reliance%20petroleum,RI
1,2,HDFC Bank Ltd.,Banks,9.69%,hdfc%20bank,HDF01
2,3,Infosys Limited,Software,7.63%,infosys,IT
3,4,Housing Development Fin. Corp. Ltd.,Finance,6.44%,hdfc,HDF
4,5,Tata Consultancy Services Ltd.,Software,5.41%,tcs,TCS


In [4]:
def initialize(yr, urlsplit):
    '''
    Function to obtain total number of result pages, initialize blank news data and
    set urls for moneycontrol news search page for the input year 'yr'.
    '''
    global ticker, url_all, headlines, dates, news, urls, sources
    
    urlyr = "https://www.moneycontrol.com/stocks/company_info/stock_news.php?sc_id=" + urlsplit + "&durationType=Y&Year={}"
    url = "https://www.moneycontrol.com/stocks/company_info/stock_news.php?sc_id={}&scat=&pageno={}&next=0&durationType=Y&Year={}&duration=1&news_type="
    
    soup = BeautifulSoup(get(urlyr.format(yr)).text, 'lxml')
    ticker = soup.select('div.FL.gry10')[0].text.split('|')[1].split(':')[1].strip()
    result_max = len(soup.select('div.pages.MR10.MT15')[0].select('a')) + 1
    
    url_all = [url.format(urlsplit, i, yr) for i in range(1, result_max+1)]
    headlines, dates, news, urls, sources = [], [], [], [], []
    print("Total number of result pages for", ticker, "in the year", yr, ":", result_max)

In [13]:
def getnewslinks():
    '''
    Function to scrape news headlines, urls, publish dates etc.
    '''
    print("[INFO] Extracting Links...")

    for src in tqdm(url_all):

        try:
            soup = BeautifulSoup(get(src).text, 'lxml')

            # Extracts the Headlines
            try:
                headline = [script.text.strip() for script in soup.select('a.g_14bl')]
                headlines.extend(headline)
            except:
                print('Exception in Headline')
                headlines.extend(None)

            # Extracts the urls
            try:
                source = ["https://www.moneycontrol.com"+script.get('href') for script in soup.select('a.g_14bl')]
                urls.extend(source)
            except:
                print('Exception in url')
                urls.extend(None)

            # Extracts the published dates
            try:
                dateline = [str(parser.parse(script.text.split('|')[1].strip())).split()[0] for script in soup.select('p.PT3.a_10dgry')]
                dates.extend(dateline)
            except:
                print('Exception in dateline')
                dates.extend(None)

            # Extracts the bylines
            try:
                bylines = [script.select('span.a_2_10bl')[0].text.strip() if len(script.select('span.a_2_10bl'))==1 else None
                           for script in soup.select('p.PT3.a_10dgry')]
                sources.extend(bylines)
            except:
                print('Exception in bylines')
                sources.extend(None)

        except:
            print("Exception occurred in url : ", src)
            break

    print("[INFO] Links Extracted.")
    print("Total No. of Pages to be Scraped = ", len(urls))
    print("Oldest Available Article: ", min(dates))

In [14]:
def getarticles(thres=20):
    '''
    Function to scrape news articles. Any paragraph with words less than 'thres' will not be considered.
    '''
    print("[INFO] Extracting Articles...")

    for src in tqdm(urls):
        try:
            # Parse the url to NewsPage
            soup = BeautifulSoup(get(src).text, 'lxml')

            # Extracts the news articles
            try:
                news_article = '.'.join([scrape.text.strip() for scrape in soup.select("div.arti-flow")[0].select("p")
                                         if scrape.text.split() >= thres])
                news.append(news_article)
            except:
                news.append(None)

        except:
            print("Exception occurred in url : ", src)
            news.append(None)

    print("[INFO] Articles Extracted.")

In [15]:
def chkdata():
    '''
    Function to check for any missing values in the Dataframe and drop it.
    '''
    global df
    df = pd.DataFrame({'Headlines': headlines,
                       'Articles': news,
                       'Published_Dates': dates,
                       'Source_URLs': urls,
                       'ByLines' : sources
                       })
    print("Missing Info in Scraped Data :")
    print(df.isna().sum())
    df=df.dropna(axis = 0)
    print("Total Usable Scraped Data : ", df.shape)

In [16]:
def savefile(tickr,yr):
    '''
    Function to save the scraped data as pickle file.
    '''
    # df.to_csv("news_mcontrol_"+ tickr + "_" + str(yr) + ".csv")
    df.to_pickle("news_mcontrol_"+ tickr + "_" + str(yr) + ".pkl")
    print("Data saved for", tickr, "for year",yr, ".")

In [12]:
# Scraping 2020 news articles for all the companies listed in Nifty50

yr = 2020

for i, sstring in enumerate(Substring):
    print("Nifty50 Extraction Search Count :",i+1)
    initialize(yr, sstring)
    getnewslinks()
    getarticles()
    chkdata()
    savefile(ticker, yr)
    time.sleep(5)

Nifty50 Extraction Search Count : 1


  0%|                                                                                            | 0/8 [00:00<?, ?it/s]

Total number of result pages for RELIANCE in the year 2020 : 8
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:24<00:00,  3.10s/it]
  0%|                                                                                          | 0/155 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  155
Oldest Available Article:  2020-05-04
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 155/155 [04:51<00:00,  1.88s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines            0
Articles           155
Published_Dates      0
Source_URLs          0
ByLines              0
dtype: int64
Total Usable Scraped Data :  (0, 5)
Data saved for RELIANCE for year 2020 .
Nifty50 Extraction Search Count : 2


  0%|                                                                                            | 0/6 [00:00<?, ?it/s]

Total number of result pages for HDFCBANK in the year 2020 : 6
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:15<00:00,  2.60s/it]
  0%|                                                                                          | 0/119 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  119
Oldest Available Article:  2020-01-28
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 119/119 [03:49<00:00,  1.93s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines            0
Articles           119
Published_Dates      0
Source_URLs          0
ByLines              0
dtype: int64
Total Usable Scraped Data :  (0, 5)
Data saved for HDFCBANK for year 2020 .
Nifty50 Extraction Search Count : 3


  0%|                                                                                            | 0/6 [00:00<?, ?it/s]

Total number of result pages for INFY in the year 2020 : 6
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:23<00:00,  3.99s/it]
  0%|                                                                                          | 0/120 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  120
Oldest Available Article:  2020-03-03
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 120/120 [04:52<00:00,  2.44s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines            0
Articles           120
Published_Dates      0
Source_URLs          0
ByLines              0
dtype: int64
Total Usable Scraped Data :  (0, 5)
Data saved for INFY for year 2020 .
Nifty50 Extraction Search Count : 4


  0%|                                                                                            | 0/3 [00:00<?, ?it/s]

Total number of result pages for HDFC in the year 2020 : 3
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:16<00:00,  5.43s/it]
  0%|                                                                                           | 0/57 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  57
Oldest Available Article:  2020-03-05
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 57/57 [01:57<00:00,  2.06s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Articles           57
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (0, 5)
Data saved for HDFC for year 2020 .
Nifty50 Extraction Search Count : 5


  0%|                                                                                            | 0/5 [00:00<?, ?it/s]

Total number of result pages for TCS in the year 2020 : 5
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:22<00:00,  4.57s/it]
  0%|                                                                                          | 0/100 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  100
Oldest Available Article:  2020-02-04
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [03:29<00:00,  2.10s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines            0
Articles           100
Published_Dates      0
Source_URLs          0
ByLines              0
dtype: int64
Total Usable Scraped Data :  (0, 5)
Data saved for TCS for year 2020 .
Nifty50 Extraction Search Count : 6


  0%|                                                                                            | 0/5 [00:00<?, ?it/s]

Total number of result pages for ICICIBANK in the year 2020 : 5
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:19<00:00,  3.92s/it]
  0%|                                                                                           | 0/97 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  97
Oldest Available Article:  2020-04-01
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 97/97 [03:16<00:00,  2.03s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Articles           96
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (1, 5)
Data saved for ICICIBANK for year 2020 .
Nifty50 Extraction Search Count : 7


  0%|                                                                                            | 0/3 [00:00<?, ?it/s]

Total number of result pages for KOTAKBANK in the year 2020 : 3
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:15<00:00,  5.01s/it]
  0%|                                                                                           | 0/55 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  55
Oldest Available Article:  2020-04-23
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 55/55 [02:06<00:00,  2.30s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Articles           54
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (1, 5)
Data saved for KOTAKBANK for year 2020 .
Nifty50 Extraction Search Count : 8


  0%|                                                                                            | 0/5 [00:00<?, ?it/s]

Total number of result pages for HINDUNILVR in the year 2020 : 5
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:27<00:00,  5.50s/it]
  0%|                                                                                          | 0/100 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  100
Oldest Available Article:  2020-03-23
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [04:11<00:00,  2.52s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Articles           98
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (2, 5)
Data saved for HINDUNILVR for year 2020 .
Nifty50 Extraction Search Count : 9


  0%|                                                                                            | 0/3 [00:00<?, ?it/s]

Total number of result pages for ITC in the year 2020 : 3
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:16<00:00,  5.43s/it]
  0%|                                                                                           | 0/46 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  46
Oldest Available Article:  2020-04-14
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 46/46 [01:15<00:00,  1.64s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Articles           45
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (1, 5)
Data saved for ITC for year 2020 .
Nifty50 Extraction Search Count : 10


  0%|                                                                                            | 0/5 [00:00<?, ?it/s]

Total number of result pages for LT in the year 2020 : 5
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:20<00:00,  4.19s/it]
  0%|                                                                                          | 0/100 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  100
Oldest Available Article:  2020-01-27
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [02:45<00:00,  1.65s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Articles           99
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (1, 5)
Data saved for LT for year 2020 .
Nifty50 Extraction Search Count : 11


  0%|                                                                                            | 0/4 [00:00<?, ?it/s]

Total number of result pages for AXISBANK in the year 2020 : 4
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:16<00:00,  4.07s/it]
  0%|                                                                                           | 0/69 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  69
Oldest Available Article:  2020-04-02
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 69/69 [01:57<00:00,  1.70s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Articles           69
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (0, 5)
Data saved for AXISBANK for year 2020 .
Nifty50 Extraction Search Count : 12


  0%|                                                                                            | 0/7 [00:00<?, ?it/s]

Total number of result pages for BHARTIARTL in the year 2020 : 7
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:28<00:00,  4.14s/it]
  0%|                                                                                          | 0/138 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  138
Oldest Available Article:  2020-02-17
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 138/138 [05:39<00:00,  2.46s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines            0
Articles           137
Published_Dates      0
Source_URLs          0
ByLines              0
dtype: int64
Total Usable Scraped Data :  (1, 5)
Data saved for BHARTIARTL for year 2020 .
Nifty50 Extraction Search Count : 13


  0%|                                                                                            | 0/3 [00:00<?, ?it/s]

Total number of result pages for ASIANPAINT in the year 2020 : 3
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:15<00:00,  5.01s/it]
  0%|                                                                                           | 0/52 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  52
Oldest Available Article:  2020-01-23
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 52/52 [02:14<00:00,  2.59s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Articles           52
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (0, 5)
Data saved for ASIANPAINT for year 2020 .
Nifty50 Extraction Search Count : 14


  0%|                                                                                           | 0/10 [00:00<?, ?it/s]

Total number of result pages for MARUTI in the year 2020 : 10
[INFO] Extracting Links...


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:54<00:00,  5.44s/it]
  0%|                                                                                          | 0/199 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  199
Oldest Available Article:  2020-01-22
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 199/199 [06:33<00:00,  1.98s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines            0
Articles           197
Published_Dates      0
Source_URLs          0
ByLines              0
dtype: int64
Total Usable Scraped Data :  (2, 5)
Data saved for MARUTI for year 2020 .
Nifty50 Extraction Search Count : 15


  0%|                                                                                            | 0/4 [00:00<?, ?it/s]

Total number of result pages for HCLTECH in the year 2020 : 4
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:15<00:00,  3.93s/it]
  0%|                                                                                           | 0/75 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  75
Oldest Available Article:  2020-01-22
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 75/75 [02:02<00:00,  1.64s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Articles           75
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (0, 5)
Data saved for HCLTECH for year 2020 .
Nifty50 Extraction Search Count : 16


  0%|                                                                                            | 0/2 [00:00<?, ?it/s]

Total number of result pages for BAJFINANCE in the year 2020 : 2
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:08<00:00,  4.45s/it]
  0%|                                                                                           | 0/26 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  26
Oldest Available Article:  2020-05-19
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 26/26 [00:43<00:00,  1.66s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Articles           26
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (0, 5)
Data saved for BAJFINANCE for year 2020 .
Nifty50 Extraction Search Count : 17


  0%|                                                                                           | 0/10 [00:00<?, ?it/s]

Total number of result pages for SBIN in the year 2020 : 10
[INFO] Extracting Links...


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:45<00:00,  4.52s/it]
  0%|                                                                                          | 0/193 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  193
Oldest Available Article:  2020-03-16
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 193/193 [06:24<00:00,  1.99s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines            0
Articles           193
Published_Dates      0
Source_URLs          0
ByLines              0
dtype: int64
Total Usable Scraped Data :  (0, 5)
Data saved for SBIN for year 2020 .
Nifty50 Extraction Search Count : 18


  0%|                                                                                            | 0/4 [00:00<?, ?it/s]

Total number of result pages for DRREDDY in the year 2020 : 4
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:20<00:00,  5.11s/it]
  0%|                                                                                           | 0/78 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  78
Oldest Available Article:  2020-02-10
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 78/78 [03:08<00:00,  2.42s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Articles           78
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (0, 5)
Data saved for DRREDDY for year 2020 .
Nifty50 Extraction Search Count : 19


  0%|                                                                                            | 0/5 [00:00<?, ?it/s]

Total number of result pages for M&M in the year 2020 : 5
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:25<00:00,  5.17s/it]
  0%|                                                                                           | 0/99 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  99
Oldest Available Article:  2020-02-11
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 99/99 [04:00<00:00,  2.43s/it]

[INFO] Articles Extracted.





ValueError: arrays must all be same length

In [18]:
# Scraping 2020 news articles for all the companies listed in Nifty50

yr = 2020

for i, sstring in enumerate(Substring):
    if i==18:
        print("Nifty50 Extraction Search Count :",i+1)
        initialize(yr, sstring)
        getnewslinks()
        getarticles()
        chkdata()
        savefile(ticker, yr)
        time.sleep(5)

Nifty50 Extraction Search Count : 19


  0%|                                                                                            | 0/5 [00:00<?, ?it/s]

Total number of result pages for M&M in the year 2020 : 5
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:21<00:00,  4.29s/it]
  0%|                                                                                           | 0/99 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  99
Oldest Available Article:  2020-02-11
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 99/99 [02:55<00:00,  1.77s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Articles           96
Published_Dates     0
Source_URLs         0
ByLines             1
dtype: int64
Total Usable Scraped Data :  (3, 5)
Data saved for M&M for year 2020 .


In [17]:
# Scraping 2020 news articles for all the companies listed in Nifty50

yr = 2020

for i, sstring in enumerate(Substring):
    if i>=19:
        print("Nifty50 Extraction Search Count :",i+1)
        initialize(yr, sstring)
        getnewslinks()
        getarticles()
        chkdata()
        savefile(ticker, yr)
        time.sleep(5)

Nifty50 Extraction Search Count : 20


  0%|                                                                                            | 0/2 [00:00<?, ?it/s]

Total number of result pages for NESTLEIND in the year 2020 : 2
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:07<00:00,  3.73s/it]
  0%|                                                                                           | 0/26 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  26
Oldest Available Article:  2020-05-12
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 26/26 [00:44<00:00,  1.73s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Articles           26
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (0, 5)
Data saved for NESTLEIND for year 2020 .
Nifty50 Extraction Search Count : 21


  0%|                                                                                            | 0/3 [00:00<?, ?it/s]

Total number of result pages for SUNPHARMA in the year 2020 : 3
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:11<00:00,  3.80s/it]
  0%|                                                                                           | 0/52 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  52
Oldest Available Article:  2020-05-23
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 52/52 [02:00<00:00,  2.32s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Articles           52
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (0, 5)
Data saved for SUNPHARMA for year 2020 .
Nifty50 Extraction Search Count : 22


  0%|                                                                                            | 0/3 [00:00<?, ?it/s]

Total number of result pages for TITAN in the year 2020 : 3
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:18<00:00,  6.29s/it]
  0%|                                                                                           | 0/52 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  52
Oldest Available Article:  2020-02-07
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 52/52 [02:13<00:00,  2.57s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Articles           52
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (0, 5)
Data saved for TITAN for year 2020 .
Nifty50 Extraction Search Count : 23


  0%|                                                                                            | 0/3 [00:00<?, ?it/s]

Total number of result pages for TECHM in the year 2020 : 3
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:18<00:00,  6.17s/it]
  0%|                                                                                           | 0/48 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  48
Oldest Available Article:  2020-04-08
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 48/48 [02:02<00:00,  2.55s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Articles           48
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (0, 5)
Data saved for TECHM for year 2020 .
Nifty50 Extraction Search Count : 24


  0%|                                                                                            | 0/3 [00:00<?, ?it/s]

Total number of result pages for ULTRACEMCO in the year 2020 : 3
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:16<00:00,  5.60s/it]
  0%|                                                                                           | 0/42 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  42
Oldest Available Article:  2020-05-20
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 42/42 [01:48<00:00,  2.59s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Articles           42
Published_Dates     0
Source_URLs         0
ByLines             1
dtype: int64
Total Usable Scraped Data :  (0, 5)
Data saved for ULTRACEMCO for year 2020 .
Nifty50 Extraction Search Count : 25


  0%|                                                                                            | 0/4 [00:00<?, ?it/s]

Total number of result pages for WIPRO in the year 2020 : 4
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:20<00:00,  5.23s/it]
  0%|                                                                                           | 0/71 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  71
Oldest Available Article:  2020-04-07
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 71/71 [03:28<00:00,  2.93s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Articles           71
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (0, 5)
Data saved for WIPRO for year 2020 .
Nifty50 Extraction Search Count : 26


  0%|                                                                                            | 0/3 [00:00<?, ?it/s]

Total number of result pages for BRITANNIA in the year 2020 : 3
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:17<00:00,  5.82s/it]
  0%|                                                                                           | 0/52 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  52
Oldest Available Article:  2020-03-26
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 52/52 [02:19<00:00,  2.67s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Articles           52
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (0, 5)
Data saved for BRITANNIA for year 2020 .
Nifty50 Extraction Search Count : 27


  0%|                                                                                            | 0/2 [00:00<?, ?it/s]

Total number of result pages for HDFCLIFE in the year 2020 : 2
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:11<00:00,  5.57s/it]
  0%|                                                                                           | 0/34 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  34
Oldest Available Article:  2020-03-18
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 34/34 [01:31<00:00,  2.69s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Articles           33
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (1, 5)
Data saved for HDFCLIFE for year 2020 .
Nifty50 Extraction Search Count : 28


  0%|                                                                                            | 0/2 [00:00<?, ?it/s]

Total number of result pages for POWERGRID in the year 2020 : 2
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:13<00:00,  6.82s/it]
  0%|                                                                                           | 0/20 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  20
Oldest Available Article:  2020-01-14
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:51<00:00,  2.58s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Articles           20
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (0, 5)
Data saved for POWERGRID for year 2020 .
Nifty50 Extraction Search Count : 29


  0%|                                                                                            | 0/3 [00:00<?, ?it/s]

Total number of result pages for NTPC in the year 2020 : 3
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:16<00:00,  5.64s/it]
  0%|                                                                                           | 0/60 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  60
Oldest Available Article:  2020-01-01
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 60/60 [02:42<00:00,  2.70s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Articles           60
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (0, 5)
Data saved for NTPC for year 2020 .
Nifty50 Extraction Search Count : 30


  0%|                                                                                            | 0/5 [00:00<?, ?it/s]

Total number of result pages for HEROMOTOCO in the year 2020 : 5
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:28<00:00,  5.74s/it]
  0%|                                                                                           | 0/88 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  88
Oldest Available Article:  2020-03-23
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 88/88 [03:39<00:00,  2.50s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Articles           88
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (0, 5)
Data saved for HEROMOTOCO for year 2020 .
Nifty50 Extraction Search Count : 31


  0%|                                                                                            | 0/4 [00:00<?, ?it/s]

Total number of result pages for CIPLA in the year 2020 : 4
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:21<00:00,  5.49s/it]
  0%|                                                                                           | 0/64 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  64
Oldest Available Article:  2020-03-26
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 64/64 [02:34<00:00,  2.41s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Articles           63
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (1, 5)
Data saved for CIPLA for year 2020 .
Nifty50 Extraction Search Count : 32


  0%|                                                                                            | 0/2 [00:00<?, ?it/s]

Total number of result pages for DIVISLAB in the year 2020 : 2
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:15<00:00,  7.73s/it]
  0%|                                                                                           | 0/22 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  22
Oldest Available Article:  2020-02-01
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 22/22 [00:52<00:00,  2.37s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Articles           22
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (0, 5)
Data saved for DIVISLAB for year 2020 .
Nifty50 Extraction Search Count : 33


  0%|                                                                                            | 0/6 [00:00<?, ?it/s]

Total number of result pages for BAJAJ-AUTO in the year 2020 : 6
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:32<00:00,  5.47s/it]
  0%|                                                                                          | 0/106 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  106
Oldest Available Article:  2020-02-04
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 106/106 [04:20<00:00,  2.45s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines            0
Articles           105
Published_Dates      0
Source_URLs          0
ByLines              0
dtype: int64
Total Usable Scraped Data :  (1, 5)
Data saved for BAJAJ-AUTO for year 2020 .
Nifty50 Extraction Search Count : 34


  0%|                                                                                            | 0/2 [00:00<?, ?it/s]

Total number of result pages for BAJAJFINSV in the year 2020 : 2
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:13<00:00,  6.74s/it]
  0%|                                                                                           | 0/27 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  27
Oldest Available Article:  2020-01-29
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 27/27 [01:01<00:00,  2.28s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Articles           27
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (0, 5)
Data saved for BAJAJFINSV for year 2020 .
Nifty50 Extraction Search Count : 35


  0%|                                                                                            | 0/2 [00:00<?, ?it/s]

Total number of result pages for SBILIFE in the year 2020 : 2
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:13<00:00,  6.75s/it]
  0%|                                                                                           | 0/32 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  32
Oldest Available Article:  2020-01-22
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 32/32 [01:21<00:00,  2.54s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Articles           32
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (0, 5)
Data saved for SBILIFE for year 2020 .
Nifty50 Extraction Search Count : 36


  0%|                                                                                            | 0/3 [00:00<?, ?it/s]

Total number of result pages for EICHERMOT in the year 2020 : 3
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:16<00:00,  5.37s/it]
  0%|                                                                                           | 0/45 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  45
Oldest Available Article:  2020-03-03
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 45/45 [01:47<00:00,  2.38s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Articles           42
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (3, 5)
Data saved for EICHERMOT for year 2020 .
Nifty50 Extraction Search Count : 37


  0%|                                                                                            | 0/3 [00:00<?, ?it/s]

Total number of result pages for INDUSINDBK in the year 2020 : 3
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:16<00:00,  5.47s/it]
  0%|                                                                                           | 0/46 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  46
Oldest Available Article:  2020-04-29
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 46/46 [01:45<00:00,  2.30s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Articles           46
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (0, 5)
Data saved for INDUSINDBK for year 2020 .
Nifty50 Extraction Search Count : 38


  0%|                                                                                            | 0/2 [00:00<?, ?it/s]

Total number of result pages for GRASIM in the year 2020 : 2
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:14<00:00,  7.25s/it]
  0%|                                                                                           | 0/21 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  21
Oldest Available Article:  2020-02-10
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 21/21 [01:28<00:00,  4.22s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Articles           21
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (0, 5)
Data saved for GRASIM for year 2020 .
Nifty50 Extraction Search Count : 39


  0%|                                                                                            | 0/3 [00:00<?, ?it/s]

Total number of result pages for BPCL in the year 2020 : 3
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:16<00:00,  5.63s/it]
  0%|                                                                                           | 0/49 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  49
Oldest Available Article:  2020-02-14
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 49/49 [02:03<00:00,  2.51s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Articles           49
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (0, 5)
Data saved for BPCL for year 2020 .
Nifty50 Extraction Search Count : 40


  0%|                                                                                            | 0/3 [00:00<?, ?it/s]

Total number of result pages for JSWSTEEL in the year 2020 : 3
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:18<00:00,  6.17s/it]
  0%|                                                                                           | 0/60 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  60
Oldest Available Article:  2020-01-24
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 60/60 [02:38<00:00,  2.64s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Articles           60
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (0, 5)
Data saved for JSWSTEEL for year 2020 .
Nifty50 Extraction Search Count : 41


  0%|                                                                                            | 0/2 [00:00<?, ?it/s]

Total number of result pages for UPL in the year 2020 : 2
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:10<00:00,  5.38s/it]
  0%|                                                                                           | 0/22 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  22
Oldest Available Article:  2020-05-25
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 22/22 [00:59<00:00,  2.70s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Articles           22
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (0, 5)
Data saved for UPL for year 2020 .
Nifty50 Extraction Search Count : 42


  0%|                                                                                            | 0/2 [00:00<?, ?it/s]

Total number of result pages for SHREECEM in the year 2020 : 2
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:16<00:00,  8.30s/it]
  0%|                                                                                           | 0/29 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  29
Oldest Available Article:  2020-01-07
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 29/29 [01:39<00:00,  3.43s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Articles           29
Published_Dates     0
Source_URLs         0
ByLines             1
dtype: int64
Total Usable Scraped Data :  (0, 5)
Data saved for SHREECEM for year 2020 .
Nifty50 Extraction Search Count : 43


  0%|                                                                                            | 0/4 [00:00<?, ?it/s]

Total number of result pages for TATASTEEL in the year 2020 : 4
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:23<00:00,  5.96s/it]
  0%|                                                                                           | 0/77 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  77
Oldest Available Article:  2020-01-27
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 77/77 [03:08<00:00,  2.45s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Articles           77
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (0, 5)
Data saved for TATASTEEL for year 2020 .
Nifty50 Extraction Search Count : 44


  0%|                                                                                            | 0/2 [00:00<?, ?it/s]

Total number of result pages for HINDALCO in the year 2020 : 2
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:14<00:00,  7.10s/it]
  0%|                                                                                           | 0/32 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  32
Oldest Available Article:  2020-02-12
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 32/32 [01:14<00:00,  2.32s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Articles           32
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (0, 5)
Data saved for HINDALCO for year 2020 .
Nifty50 Extraction Search Count : 45


  0%|                                                                                            | 0/2 [00:00<?, ?it/s]

Total number of result pages for ADANIPORTS in the year 2020 : 2
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:11<00:00,  5.86s/it]
  0%|                                                                                           | 0/30 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  30
Oldest Available Article:  2020-02-04
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [01:19<00:00,  2.66s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Articles           30
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (0, 5)
Data saved for ADANIPORTS for year 2020 .
Nifty50 Extraction Search Count : 46


  0%|                                                                                            | 0/3 [00:00<?, ?it/s]

Total number of result pages for ONGC in the year 2020 : 3
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:17<00:00,  5.91s/it]
  0%|                                                                                           | 0/54 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  54
Oldest Available Article:  2020-01-05
[INFO] Extracting Articles...


 80%|█████████████████████████████████████████████████████████████████▎                | 43/54 [01:45<00:30,  2.74s/it]

Exception occurred in url :  https://www.moneycontrol.com/news/business/ongc-starts-pumping-gasï»¿krishna-godavari-block_13588601.html


100%|██████████████████████████████████████████████████████████████████████████████████| 54/54 [02:09<00:00,  2.40s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Articles           54
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (0, 5)
Data saved for ONGC for year 2020 .
Nifty50 Extraction Search Count : 47


  0%|                                                                                            | 0/4 [00:00<?, ?it/s]

Total number of result pages for COALINDIA in the year 2020 : 4
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:22<00:00,  5.68s/it]
  0%|                                                                                           | 0/62 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  62
Oldest Available Article:  2020-05-16
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 62/62 [03:23<00:00,  3.28s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Articles           62
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (0, 5)
Data saved for COALINDIA for year 2020 .
Nifty50 Extraction Search Count : 48


  0%|                                                                                            | 0/7 [00:00<?, ?it/s]

Total number of result pages for TATAMOTORS in the year 2020 : 7
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:37<00:00,  5.30s/it]
  0%|                                                                                          | 0/140 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  140
Oldest Available Article:  2020-02-05
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 140/140 [05:29<00:00,  2.36s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines            0
Articles           139
Published_Dates      0
Source_URLs          0
ByLines              0
dtype: int64
Total Usable Scraped Data :  (1, 5)
Data saved for TATAMOTORS for year 2020 .
Nifty50 Extraction Search Count : 49


  0%|                                                                                            | 0/4 [00:00<?, ?it/s]

Total number of result pages for IOC in the year 2020 : 4
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:21<00:00,  5.39s/it]
  0%|                                                                                           | 0/73 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  73
Oldest Available Article:  2020-01-13
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 73/73 [04:39<00:00,  3.83s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Articles           73
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (0, 5)
Data saved for IOC for year 2020 .
Nifty50 Extraction Search Count : 50


  0%|                                                                                            | 0/2 [00:00<?, ?it/s]

Total number of result pages for GAIL in the year 2020 : 2
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:10<00:00,  5.39s/it]
  0%|                                                                                           | 0/34 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  34
Oldest Available Article:  2020-01-17
[INFO] Extracting Articles...


100%|██████████████████████████████████████████████████████████████████████████████████| 34/34 [01:10<00:00,  2.08s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines           0
Articles           34
Published_Dates     0
Source_URLs         0
ByLines             0
dtype: int64
Total Usable Scraped Data :  (0, 5)
Data saved for GAIL for year 2020 .


In [None]:
# The pages with blank Articles, if present, might have a different structure.
# Some pages have no byline. Of these, some have video articles.