In [1]:
from bs4 import BeautifulSoup
from requests import get
import pandas as pd
from tqdm import tqdm
from dateutil import parser
import string
import re
import time

In [3]:
# Scrape the latest Nifty50 index composition with the previous day's details
# (company names, industries, prices, change, market cap).

getnifty50 = "https://www.moneycontrol.com/stocks/marketstats/indexcomp.php?optex=NSE&opttopic=indexcomp&index=9"

# Fail fast on a stalled connection or an HTTP error page instead of hanging
# the kernel or parsing an error document as if it were the index table.
response = get(getnifty50, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'lxml')
composition_n50 = soup.select('table.tbldata14.bdrtpg')[0]

Scrape_date = parser.parse(soup.select('div.FR.b_15.PT5')[0].text)

# Query the table once per selector, then slice the cached text values.
link_texts = [tag.text.strip() for tag in composition_n50.select("a.bl_12")]
cell_texts = [tag.text.strip() for tag in composition_n50.select('td.brdrgtgry')]

# The first two link matches are skipped; the remaining ones are read as
# alternating company name / industry.
Company_Name = link_texts[2::2]
Industry = link_texts[3::2]

# The matched cells are read with a stride of 6; offsets 2-5 hold the values below.
Last_Price = cell_texts[2::6]
Change = cell_texts[3::6]
Change_percent = cell_texts[4::6]
Mrk_Cap = cell_texts[5::6]

nifty50_latest = pd.DataFrame({
    'Company_Name' : Company_Name,
    'Industry' : Industry,
    'Last_Price' : Last_Price,
    'Change' : Change,
    'Change_percent' : Change_percent,
    'Mrk_Cap(Rs Cr)' : Mrk_Cap
})

# Every row carries the timestamp parsed from the page.
nifty50_latest['Scrape_date'] = Scrape_date
print(nifty50_latest.shape)
nifty50_latest.to_csv("nifty50_latest.csv")
nifty50_latest.head()

(50, 7)


Unnamed: 0,Company_Name,Industry,Last_Price,Change,Change_percent,Mrk_Cap(Rs Cr),Scrape_date
0,Adani Ports,Transport Infrastructure,361.1,7.45,2.11,73366.56,2020-11-04 15:59:00
1,Asian Paints,Paints,2169.45,14.8,0.69,208093.16,2020-11-04 15:59:00
2,Axis Bank,Bank - Private,525.15,-9.0,-1.68,160705.88,2020-11-04 15:59:00
3,Bajaj Auto,Automobile - 2 & 3 Wheelers,2926.1,11.25,0.39,84671.68,2020-11-04 15:59:00
4,Bajaj Finance,Finance - NBFC,3561.45,70.65,2.02,214608.47,2020-11-04 15:59:00


In [2]:
# Load the lookup table that maps each Nifty50 stock to its MoneyControl url sub-string.
nifty50_lookuptable = pd.read_csv("nifty50_lookuptable.csv")

# Plain Python list of the url sub-strings, in table order.
Substring = list(nifty50_lookuptable['mcontrol_substring'])

print(nifty50_lookuptable.shape)
nifty50_lookuptable.head()

(50, 6)


Unnamed: 0,Sr.No.,Company Name,Sector,Weightage,thehindu_searchstring,mcontrol_substring
0,1,Reliance Industries Ltd.,Petroleum Products,14.93%,reliance%20petroleum,RI
1,2,HDFC Bank Ltd.,Banks,9.69%,hdfc%20bank,HDF01
2,3,Infosys Limited,Software,7.63%,infosys,IT
3,4,Housing Development Fin. Corp. Ltd.,Finance,6.44%,hdfc,HDF
4,5,Tata Consultancy Services Ltd.,Software,5.41%,tcs,TCS


In [5]:
def initialize(yr, urlsplit, timeout=30):
    '''
    Function to obtain total number of result pages, initialize blank news data and
    set urls for moneycontrol news search page for the input year 'yr'.

    Parameters:
        yr       : year to search; substituted into the 'Year' url parameter.
        urlsplit : MoneyControl company sub-string; substituted into the 'sc_id' url parameter.
        timeout  : seconds to wait for the first search page before giving up.

    Sets the globals 'ticker', 'url_all', 'headlines', 'dates', 'news', 'urls' and 'sources'.
    Raises an HTTP error from requests if the search page returns an error status, and
    IndexError if the ticker line is not found on the page.
    '''
    global ticker, url_all, headlines, dates, news, urls, sources

    # Both templates are filled with a single .format() call, so a sub-string
    # containing braces cannot corrupt the template.
    urlyr = "https://www.moneycontrol.com/stocks/company_info/stock_news.php?sc_id={}&durationType=Y&Year={}"
    url = "https://www.moneycontrol.com/stocks/company_info/stock_news.php?sc_id={}&scat=&pageno={}&next=0&durationType=Y&Year={}&duration=1&news_type="

    response = get(urlyr.format(urlsplit, yr), timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')

    # The ticker is the value after ':' in the second '|'-separated field.
    ticker = soup.select('div.FL.gry10')[0].text.split('|')[1].split(':')[1].strip()

    # Page count = number of links in the pager block + 1.
    # A page without a pager block is assumed to be a single result page - TODO confirm.
    pagers = soup.select('div.pages.MR10.MT15')
    result_max = len(pagers[0].select('a')) + 1 if pagers else 1

    url_all = [url.format(urlsplit, i, yr) for i in range(1, result_max+1)]
    headlines, dates, news, urls, sources = [], [], [], [], []
    print("Total number of result pages for", ticker, "in the year", yr, ":", result_max)

In [6]:
def _listing_url(anchor):
    '''Return the absolute article url of a headline link, or None if it has no href.'''
    href = anchor.get('href')
    return "https://www.moneycontrol.com" + href if href else None


def _listing_date(meta):
    '''Return the publish date of a listing's meta line as 'YYYY-MM-DD', or None if it cannot be parsed.'''
    try:
        return str(parser.parse(meta.text.split('|')[1].strip())).split()[0]
    except (IndexError, ValueError, OverflowError):
        return None


def _listing_byline(meta):
    '''Return the byline text when the meta line has exactly one byline span, else None.'''
    spans = meta.select('span.a_2_10bl')
    return spans[0].text.strip() if len(spans) == 1 else None


def _padded(values, size):
    '''Return 'values' extended with None up to 'size' items.'''
    return values + [None] * (size - len(values))


def getnewslinks(timeout=30):
    '''
    Function to scrape news headlines, urls, publish dates etc.

    Walks every result page in the global 'url_all' and appends one entry per
    listing to the globals 'headlines', 'urls', 'dates' and 'sources'. A field
    that cannot be extracted is stored as None for that listing only, so the
    four lists always grow by the same amount and stay aligned.

    Parameters:
        timeout : seconds to wait for each result page before giving up.

    Stops at the first result page that cannot be fetched.
    '''
    print("[INFO] Extracting Links...")

    for src in tqdm(url_all):

        try:
            soup = BeautifulSoup(get(src, timeout=timeout).text, 'lxml')
        except Exception as exc:
            # 'Exception' (not a bare except) so that Ctrl-C still interrupts the run.
            print("Exception occurred in url : ", src, "-", exc)
            break

        anchors = soup.select('a.g_14bl')           # headline links
        metas = soup.select('p.PT3.a_10dgry')       # date / byline lines

        # If the page yields a different number of links and meta lines, the
        # shorter side is padded with None so every list grows equally.
        rows = max(len(anchors), len(metas))

        headlines.extend(_padded([a.text.strip() for a in anchors], rows))
        urls.extend(_padded([_listing_url(a) for a in anchors], rows))
        dates.extend(_padded([_listing_date(m) for m in metas], rows))
        sources.extend(_padded([_listing_byline(m) for m in metas], rows))

    print("[INFO] Links Extracted.")
    print("Total No. of Pages to be Scraped = ", len(urls))

    # Unparsed dates are None and cannot be compared with strings; skip them.
    known_dates = [d for d in dates if d is not None]
    print("Oldest Available Article: ", min(known_dates) if known_dates else None)

In [39]:
def getarticles(thres=7, timeout=30):
    '''
    Function to scrape news articles. Any paragraph with words less than 'thres' will not be considered.

    Fetches every url in the global 'urls' and appends exactly one entry per url
    to the global 'news': the kept paragraphs joined with '.', or None when the
    page cannot be fetched or has no 'div.arti-flow' container.

    Parameters:
        thres   : minimum number of whitespace-separated words a paragraph needs to be kept.
        timeout : seconds to wait for each article page before giving up.
    '''
    print("[INFO] Extracting Articles...")

    for src in tqdm(urls):
        # Parse the url to NewsPage
        try:
            soup = BeautifulSoup(get(src, timeout=timeout).text, 'lxml')
        except Exception as exc:
            # 'Exception' (not a bare except) so that Ctrl-C still interrupts the run.
            print("Exception occurred in url : ", src, "-", exc)
            news.append(None)
            continue

        # Extracts the news articles
        containers = soup.select("div.arti-flow")
        if not containers:
            # No article container on this page: record it as missing.
            news.append(None)
            continue

        paragraphs = [scrape.text.strip() for scrape in containers[0].select("p")
                      if len(scrape.text.split()) >= thres]
        news.append('.'.join(paragraphs))

    print("[INFO] Articles Extracted.")

In [40]:
def chkdata():
    '''
    Function to assemble the scraped lists into the global Dataframe 'df',
    report the missing values per column and drop every row that has any.
    '''
    global df

    columns = {
        'Headlines': headlines,
        'Articles': news,
        'Published_Dates': dates,
        'Source_URLs': urls,
        'ByLines': sources,
    }
    scraped = pd.DataFrame(columns)

    missing_per_column = scraped.isna().sum()
    print("Missing Info in Scraped Data :")
    print(missing_per_column)

    # Keep only the rows in which every column has a value.
    df = scraped.dropna(axis=0)
    print("Total Usable Scraped Data : ", df.shape)

In [41]:
def savefile(tickr,yr):
    '''
    Function to write the global Dataframe 'df' to the pickle file
    "news_mcontrol_<tickr>_<yr>.pkl" and report that it was saved.
    '''
    filename = "".join(["news_mcontrol_", tickr, "_", str(yr), ".pkl"])
    df.to_pickle(filename)
    print("Data saved for", tickr, "for year", yr, ".")

In [42]:
# Scrape the 2019 news articles for the companies listed in Nifty50.
# The trailing 'break' ends the loop after the first company (sample run).

yr = 2019

# Steps run, in this order, after initialize() has prepared the run for a company.
pipeline = (getnewslinks, getarticles, chkdata)

for i, sstring in enumerate(Substring):
    print("Nifty50 Extraction Search Count :", i + 1)
    initialize(yr, sstring)
    for step in pipeline:
        step()
    savefile(ticker, yr)
    time.sleep(5)
    break # sample

Nifty50 Extraction Search Count : 1


  0%|                                                                                            | 0/7 [00:00<?, ?it/s]

Total number of result pages for RELIANCE in the year 2019 : 7
[INFO] Extracting Links...


100%|████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:04<00:00,  1.44it/s]
  0%|                                                                                          | 0/138 [00:00<?, ?it/s]

[INFO] Links Extracted.
Total No. of Pages to be Scraped =  138
Oldest Available Article:  2019-01-06
[INFO] Extracting Articles...


100%|████████████████████████████████████████████████████████████████████████████████| 138/138 [05:30<00:00,  2.39s/it]


[INFO] Articles Extracted.
Missing Info in Scraped Data :
Headlines          0
Articles           7
Published_Dates    0
Source_URLs        0
ByLines            0
dtype: int64
Total Usable Scraped Data :  (131, 5)
Data saved for RELIANCE for year 2019 .


In [43]:
# Preview the first five usable rows of the scraped news data.
df.head(n=5)

Unnamed: 0,Headlines,Articles,Published_Dates,Source_URLs,ByLines
0,"Six of top-10 cos lose Rs 64,419cr in m-cap; R...",Six of the 10 most valued Indian companies suf...,2019-12-29,https://www.moneycontrol.com/news/business/six...,PTI
1,Mukesh Ambani's Reliance Retail valued at $34 ...,Richest Indian Mukesh Ambani's Reliance Retail...,2019-12-26,https://www.moneycontrol.com/news/business/muk...,PTI
2,"Reliance tears into govt affidavit, says no fi...",Reliance Industries has mounted a strong count...,2019-12-23,https://www.moneycontrol.com/news/business/rel...,PTI
3,Eight of top-10 cos add Rs 1.13 lakh crore in ...,Eight of the 10 most valued domestic companies...,2019-12-22,https://www.moneycontrol.com/news/business/eig...,PTI
4,Shell-Reliance give up Panna-Mukta fields; wes...,After operating Panna-Mukta oil and gas fields...,2019-12-20,https://www.moneycontrol.com/news/business/she...,PTI


In [50]:
# Show the full text of the first usable article.
# The column is selected by name and the row by position: this avoids the
# deprecated integer lookup on a label-indexed Series, and still works when
# row label 0 was removed by dropna().
df['Articles'].iloc[0]

"Six of the 10 most valued Indian companies suffered a combined erosion of Rs 64,419.10 crore in market valuation last week, with RIL taking the biggest knock..While TCS, HDFC Bank, Kotak Mahindra Bank, SBI and ITC were the other firms which witnessed a decline in their market capitalisation (m-cap) for the week ended Friday, HDFC, HUL, ICICI Bank and Infosys emerged as gainers..The valuation of Reliance Industries Limited (RIL) tumbled Rs 36,291.90 crore to Rs 9,77,600.27 crore..HDFC Bank's market cap plunged Rs 11,666.10 crore to Rs 6,98,266.18 crore and that of Tata Consultancy Services (TCS)\xa0tanked Rs 9,155.82\xa0crore to Rs 8,24,830.44 crore..Slideshow | 2019 Recap: 10 companies that saw largest m-cap increase.The m-cap of ITC fell Rs 5,241.22 crore to Rs 2,91,238.23 crore and that of Kotak Mahindra Bank slipped Rs 1,528.55 crore to Rs 3,21,960.76 crore..SBI's valuation dipped Rs 535.48 crore to Rs 3,00,982.52 crore..In contrast, Housing Development Finance Corporation (HDFC) t

In [None]:
# Pages with blank articles might have a different HTML structure.
# The articles contain the character '\xa0' (a non-breaking space, '&nbsp;' in the HTML); it will be replaced by ' ' during preprocessing.
# Some pages have no byline; some of these are video articles.