In [23]:
import bs4
import requests
import pandas as pd


# Month abbreviations that may start a period header cell such as 'Mar 20'.
_MONTH_ABBREVIATIONS = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                        'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')


def _profit_and_loss_url(company_info, type_num, format_num, count):
    """Build the moneycontrol URL for one page of a profit and loss statement."""
    # Example of webpages
    # Webpage 1: https://www.moneycontrol.com/financials/larsen&toubro/consolidated-profit-lossVI/LT#LT
    # Webpage 2: https://www.moneycontrol.com/financials/larsen&toubro/consolidated-profit-lossVI/LT/2#LT
    # Webpage 3: https://www.moneycontrol.com/financials/larsen&toubro/consolidated-profit-lossVI/LT/3#LT
    type_string = 'consolidated-' if type_num == 1 else ''
    format_string = 'VI' if format_num == 1 else ''
    url = ('https://www.moneycontrol.com/financials/' + company_info[0] + '/' + type_string +
           'profit-loss' + format_string + '/' + company_info[1])
    # The first page carries no page number; page index n (n >= 1) is addressed as n + 1.
    if count != 0:
        url += '/' + str(count + 1)
    return url + '#' + company_info[1]


def _is_period_label(text):
    """Return True for a six-character period header such as 'Mar 20'."""
    return len(text) == 6 and text[:3] in _MONTH_ABBREVIATIONS and text[-2:].isdigit()


def profit_and_loss(company_info, type_num, format_num, count, timeout=10):
    """Scrape one page of a company's profit and loss statement from moneycontrol.com.

    Parameters
    ----------
    company_info : sequence of str
        ``(long_code, short_code)`` as they appear in moneycontrol URLs,
        e.g. ``('tatamotors', 'tm03')``.
    type_num : int
        1 requests the consolidated statement; any other value requests standalone.
    format_num : int
        1 requests the new format ('VI' URL suffix); any other value requests the old one.
    count : int
        Zero-based page index. Page 0 has no page number in the URL.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` raises a timeout error.

    Returns
    -------
    dict or str
        A dict mapping each row label of the statement to the list of values found
        for the period columns on this page, plus the keys 'Company',
        'Type of Statement' and 'Format'. The string 'Webpage not valid' is returned
        when the server answers with an HTTP error status or when the page does not
        contain a usable statement table.

    Raises
    ------
    requests.RequestException
        Connection problems and timeouts are not converted into the sentinel, so a
        network failure is not mistaken for "no more pages".
    """
    url = _profit_and_loss_url(company_info, type_num, format_num, count)

    res = requests.get(url, timeout=timeout)
    try:
        res.raise_for_status()
    except requests.HTTPError:
        # Only an HTTP error status means "this page does not exist".
        return 'Webpage not valid'

    soup = bs4.BeautifulSoup(res.content, 'html.parser')
    # Arriving at table of profit and loss statement with classes and ids
    # NOTE(review): the same container id is looked up for every statement type and
    # format, exactly as before - confirm against the page markup.
    table_data = soup.find(class_='tab-pane fade active in', id='standalone-new')
    if table_data is None:
        return 'Webpage not valid'
    table_body = table_data.find(class_='mctable1')
    if table_body is None:
        return 'Webpage not valid'

    # First cell of every row = row label; rows without any <td> are skipped.
    list_of_columns = []
    for row in table_data.find_all('tr'):
        first_cell = row.find('td')
        if first_cell is not None and first_cell.text != u'\xa0':
            list_of_columns.append(first_cell.text)

    # Flat list of every non-blank cell of the statement table, in document order.
    list_of_elements = [cell.text for cell in table_body.find_all('td') if cell.text != u'\xa0']

    # Finding number of columns: one per period header cell ('Mar 20', 'Dec 19', ...).
    num_cols = sum(1 for text in list_of_elements if _is_period_label(text))

    # Renaming first column name (in both list_of_columns and list_of_elements) so that
    # data from multiple companies can be clubbed together
    for labels in (list_of_columns, list_of_elements):
        for position, text in enumerate(labels):
            if text.startswith('Profit & Loss account of'):
                labels[position] = 'Profit & Loss Statement of Year'
                break

    # The values of a row are the num_cols cells that follow its label in the flat list.
    dict_data = {}
    for label in list_of_columns:
        if label in list_of_elements:
            num = list_of_elements.index(label)
            dict_data[label] = list_of_elements[num + 1: num + num_cols + 1]

    # Without the header row, or without any period column, the page holds no data.
    periods = dict_data.get('Profit & Loss Statement of Year')
    if not periods:
        return 'Webpage not valid'
    length = len(periods)

    # Adding company name, statement type and statement format to dict_data
    dict_data['Company'] = length * [company_info[0]]
    dict_data['Type of Statement'] = length * ['Consolidated' if type_num == 1 else 'Standalone']
    dict_data['Format'] = length * ['New' if format_num == 1 else 'Old']

    return dict_data

# One dict per scraped statement page, in scraping order.
list_of_dict = []
# Dictionary of (long code, short code) tuples for a sample of Nifty 50 companies
company_info = ({'Tata Motors' : ('tatamotors', 'tm03'), 'Adani Ports' : ('adaniportsandspecialeconomiczone', 'MPS'),
'Sun Phrama' : ('sunpharmaceuticalindustries', 'SPI'), 'Cipla' : ('cipla', 'C'), 'Grasim Industries' : ('grasimindustries', 'gi01'),
'JSW Steel' : ('jswsteel', 'jsw01')})
type_num = 1    # 1 = consolidated statement
format_num = 1  # 1 = new format
for company_codes in company_info.values():
    # Walk through the pages of one company until the scraper reports an invalid page.
    count = 0
    while True:
        dic = profit_and_loss(company_codes, type_num, format_num, count)
        if dic == 'Webpage not valid':
            break
        list_of_dict.append(dic)
        count = count + 1

# Combining all dictionaries into one column -> values mapping.
# Fresh lists are built so the page dicts in list_of_dict are never mutated, and every
# column is padded with None so that pages with different row labels still line up.
master_dict = {}
total_rows = 0
for page in list_of_dict:
    page_rows = max((len(values) for values in page.values()), default=0)
    # A label seen for the first time has no value for any earlier row.
    for key in page:
        if key not in master_dict:
            master_dict[key] = [None] * total_rows
    for key, column in master_dict.items():
        values = list(page.get(key, []))
        values.extend([None] * (page_rows - len(values)))
        column.extend(values)
    total_rows += page_rows

df = pd.DataFrame.from_dict(master_dict)

In [24]:
df

Unnamed: 0,Profit & Loss Statement of Year,INCOME,Revenue From Operations [Gross],Less: Excise/Sevice Tax/Other Levies,Revenue From Operations [Net],Total Operating Revenues,Other Income,Total Revenue,EXPENSES,Cost Of Materials Consumed,...,OTHER ADDITIONAL INFORMATION,EARNINGS PER SHARE,Basic EPS (Rs.),Diluted EPS (Rs.),DIVIDEND AND DIVIDEND PERCENTAGE,Equity Share Dividend,Tax On Dividend,Company,Type of Statement,Format
0,Mar 20,,258594.36,0.00,258594.36,261067.97,2973.15,264041.12,,152671.47,...,,,-35.00,-35.00,,0.00,0.00,tatamotors,Consolidated,New
1,Mar 19,,299190.59,0.00,299190.59,301938.40,2965.31,304903.71,,181009.08,...,,,-85.00,-85.00,,0.00,0.00,tatamotors,Consolidated,New
2,Mar 18,,289386.25,790.16,288596.09,294619.18,888.89,295508.07,,171992.59,...,,,26.00,26.00,,0.00,0.00,tatamotors,Consolidated,New
3,Mar 17,,270298.08,4799.61,265498.47,269692.51,754.54,270447.05,,159369.55,...,,,22.00,22.00,,73.00,0.00,tatamotors,Consolidated,New
4,Mar 16,,274175.10,4614.99,269560.11,273045.60,885.35,273930.95,,153292.49,...,,,34.00,34.00,,0.00,0.00,tatamotors,Consolidated,New
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88,Mar 11,,25829.13,1967.56,23861.57,24105.89,189.97,24295.86,,15162.75,...,,,85.00,84.00,,273.32,48.87,jswsteel,Consolidated,New
89,Mar 10,,20211.33,1289.18,18922.15,18922.15,443.63,19369.54,,13173.81,...,,,64.00,64.00,,177.70,34.31,jswsteel,Consolidated,New
90,Mar 09,,17112.88,1172.70,15940.18,15940.18,158.76,16098.94,,11492.42,...,,,11.00,11.00,,18.71,8.11,jswsteel,Consolidated,New
91,Mar 08,,13665.56,1237.86,12427.70,12427.70,312.04,12739.74,,7872.56,...,,,66.00,66.00,,261.87,49.53,jswsteel,Consolidated,New


In [1]:
import bs4
import requests
import pandas as pd

# Fetch the moneycontrol index-constituents page, sending browser-like request headers.
res = requests.get('https://www.moneycontrol.com/stocks/marketstats/indexcomp.php?optex=NSE&opttopic=indexcomp&index=9', timeout = 2,
headers = {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8 GTB7.1 (.NET CLR 3.5.30729)", "Referer": "http://example.com"})

# Check that the webpage has been fetched successfully; parsing an error page is pointless,
# so the HTTP error is reported and re-raised instead of being swallowed.
try:
    res.raise_for_status()
except requests.HTTPError:
    print('Error in parsing the webpage')
    raise
soup = bs4.BeautifulSoup(res.content, 'html.parser')

nifty50_table = soup.find(class_ = 'tbldata14 bdrtpg')
if nifty50_table is None:
    raise ValueError("Constituents table (class 'tbldata14 bdrtpg') not found in the webpage")
nifty_table_list = nifty50_table.find_all('tr')

# For every row after the first one collect, in a single pass so the three lists stay aligned:
#   company_details: (text of first <b> tag, text of second <b> tag)
#   url_ref:         href of the first link in the row
#   codes_list:      (second-to-last, last) path segments of that href
# These codes will be used to prepare URLs while parsing financial statements.
company_details = []
url_ref = []
codes_list = []
for row in nifty_table_list[1:]:
    bold_tags = row.find_all('b')
    link = row.a
    href = link.get('href') if link is not None else None
    if len(bold_tags) < 2 or not href:
        # Rows without both bold cells and a link cannot be turned into a record.
        continue
    path_parts = href.split('/')
    if len(path_parts) < 2:
        continue
    company_details.append((bold_tags[0].text, bold_tags[1].text))
    url_ref.append(href)
    codes_list.append((path_parts[-2], path_parts[-1]))


# Master list is a list of tuples (Company name, Company sector, long code, short code)
# Long code and short code will be used to create URL address while extracting financial data of the company
master_list = [details + codes for details, codes in zip(company_details, codes_list)]

columns_list = ['Company Name', 'Company Sector', 'Long Code', 'Short Code']
df = pd.DataFrame(master_list, columns = columns_list)

Webpage 1:
https://www.moneycontrol.com/financials/larsen&toubro/consolidated-profit-lossVI/LT#LT
Webpage 2:
https://www.moneycontrol.com/financials/larsen&toubro/consolidated-profit-lossVI/LT/2#LT
Webpage 3:
https://www.moneycontrol.com/financials/larsen&toubro/consolidated-profit-lossVI/LT/3#LT

In [2]:
df

Unnamed: 0,Company Name,Company Sector,Long Code,Short Code
0,Adani Ports,Transport Infrastructure,adaniportsspecialeconomiczone,MPS
1,Asian Paints,Paints,asianpaints,AP31
2,Axis Bank,Bank - Private,axisbank,AB16
3,Bajaj Auto,Automobile - 2 & 3 Wheelers,bajajauto,BA10
4,Bajaj Finance,Finance - NBFC,bajajfinance,BAF
5,Bajaj Finserv,Finance - Investment,bajajfinserv,BF04
6,Bharti Airtel,Telecommunication - Service Provider,bhartiairtel,BA08
7,Bharti Infratel,Telecommunication - Equipment,bhartiinfratel,BI14
8,BPCL,Refineries,bharatpetroleumcorporation,BPC
9,Britannia,Consumer Food,britanniaindustries,BI
