In [1]:
%%time
# Fetches quote details for each underlying symbol from the NSE website
# Run time: about 5 minutes (one HTTP request per symbol)

# STATUS: Completed

import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup as bs4
import json
import datetime as datetime
import sys

#*****            Initializations           ****
#_______________________________________________

# When True, the list of symbols is read from a CSV file under `datapath`.
# This cell does not implement any other source for the symbol list.
load_from_disk = True   # For underlying symbol list

#******         Paths and variables         ****
#_______________________________________________

# Directory (relative to the working directory) that holds the input
# symbols CSV and receives the pickled output frame.
datapath = r'./zdata/'


#******   Error catch in list comprehension  ****
#________________________________________________

def catch(func, handle=lambda e : e, *args, **kwargs):
    '''Run ``func`` and suppress any ``Exception`` it raises.

    Meant for list comprehensions, where a try/except statement cannot
    be written inline.

    Args:
        func: callable, invoked as ``func(*args, **kwargs)``.
        handle: kept in the signature only; it is never called.
        *args, **kwargs: forwarded unchanged to ``func``.

    Returns:
        The value returned by ``func``, or ``None`` when ``func`` raised
        an ``Exception``.
    '''
    outcome = None
    try:
        outcome = func(*args, **kwargs)
    except Exception:
        # Swallowed on purpose: a failed call is reported to the caller as None.
        pass
    return outcome

#******               Symbols list          ****
#________________________________________________

if load_from_disk:
    symbols_file = datapath+'nse_eq_symbols.csv'
    try:
        # list() of a DataFrame yields its column labels, so the symbols are
        # presumably stored in the CSV header row - TODO confirm file layout.
        symbols = list(pd.read_csv(symbols_file))
    except FileNotFoundError:
        # A string argument makes sys.exit report the message on stderr and
        # use exit status 1, so the failure is not reported as success.
        sys.exit('csv file not found: ' + symbols_file)
else:
    # Nothing else in this cell defines `symbols`; stop here with a clear
    # message instead of failing later with a NameError.
    sys.exit('load_from_disk is False, but no other source for the symbol list is implemented')

#******     Underlying data extraction     *****
#_______________________________________________

def get_underlying(symbol, timeout=30):

    '''Gets the underlying data for one symbol from the NSE quote page

    Arg:
        symbol: underlying symbol as string
        timeout: seconds to wait for the HTTP response (passed to requests.get)
    Returns:
        df as dataframe: a single row with one column per key of the first
        entry in the 'data' list of the JSON embedded in the page
    Raises:
        requests.RequestException: connection failure, timeout or HTTP error status
        ValueError: the page has no 'responseDiv' element, its text is not
            valid JSON, or the JSON carries no 'data' entries'''

    # URLs
    url_base1 = "https://www.nseindia.com/live_market/dynaContent/live_watch/"
    url = url_base1 + "get_quote/GetQuote.jsp"

    # Hand the symbol over through `params` so that requests percent-encodes
    # it; a symbol containing '&' would otherwise split the query string.
    # The timeout keeps a stalled connection from hanging the whole run.
    response = requests.get(url, params={'symbol': symbol}, timeout=timeout)
    response.raise_for_status()

    # soup out the json dict
    bs_nse = bs4(response.text, 'html.parser')
    response_div = bs_nse.find(id='responseDiv')
    if response_div is None:
        raise ValueError("no 'responseDiv' element in the page for symbol " + repr(symbol))
    json_nse = json.loads(response_div.text.strip())

    records = json_nse['data'] if 'data' in json_nse else []
    if not records:
        raise ValueError("no 'data' entries in the response for symbol " + repr(symbol))

    # dict -> one-row frame (keys become columns)
    df = pd.DataFrame.from_dict(records[0], orient='index').T

    return df

# Fetch every symbol. catch() turns any failure into None so that a single
# bad symbol does not abort the whole network-bound run.
nse_underlyings = [catch(lambda: get_underlying(symbol)) for symbol in symbols]

# Report the failures explicitly; pd.concat would otherwise drop the None
# entries without any trace.
failed_symbols = [symbol for symbol, frame in zip(symbols, nse_underlyings) if frame is None]
if failed_symbols:
    print('No data retrieved for {} of {} symbols: {}'.format(
        len(failed_symbols), len(symbols), failed_symbols))

fetched_frames = [frame for frame in nse_underlyings if frame is not None]
if not fetched_frames:
    raise RuntimeError('No underlying data could be retrieved for any symbol')

# indexed by symbols
df_nse_und = pd.concat(fetched_frames).set_index('symbol')

# clean up the commas, dash placeholders and empty strs.
# Only a cell that consists of nothing but a dash is blanked: removing every
# '-' would also strip the sign from negative numeric strings and the
# hyphens inside names.
df_nse_und1 = df_nse_und.replace(',', '', regex=True)
df_nse_und1 = df_nse_und1.replace(r'^\s*-\s*$', '', regex=True)
df_nse_und1 = df_nse_und1.replace('', np.nan)

# convert date columns to datetime.date format
filtin = ['Date', 'dt']          # substrings that mark a date column
filtout = ['isExDateFlag']       # matches 'Date' but is a flag, not a date
date_columns_mask = (df_nse_und1.columns.str.contains('|'.join(filtin))
                     & ~df_nse_und1.columns.isin(filtout))
dt_col_list = df_nse_und1.columns[date_columns_mask].tolist()

if dt_col_list:
    # Series.dt.date leaves missing/unparseable entries as NaT. Mapping
    # pd.Timestamp.date over the coerced values instead is what presumably
    # produced the 0001-01-01 dates visible in the saved output.
    df_nse_und1[dt_col_list] = df_nse_und1[dt_col_list].apply(
        lambda col: pd.to_datetime(col, errors='coerce').dt.date)

# store data for next program
df_nse_und1.to_pickle(datapath+r'df_underlying.pkl')

Wall time: 5min


In [2]:
# Preview the first rows of the cleaned frame that was pickled by the previous cell
df_nse_und1.head()

Unnamed: 0_level_0,pricebandupper,applicableMargin,bcEndDate,totalSellQuantity,adhocMargin,companyName,marketType,exDate,bcStartDate,css_status_desc,...,buyQuantity3,buyQuantity2,buyQuantity1,series,faceValue,buyQuantity5,closePrice,open,isinCode,lastPrice
symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BPCL,354.0,19.93,2018-09-11,8945.0,,Bharat Petroleum Corporation Limited,N,2018-08-31,2018-09-04,Listed,...,,,,EQ,10.0,,322.05,319.6,INE029A01011,319.5
EICHERMOT,26339.85,15.8,2018-08-10,,,Eicher Motors Limited,N,2018-08-02,2018-08-04,Listed,...,,,222.0,EQ,10.0,,24735.0,24120.0,INE066A01013,24796.1
OFSS,3829.55,12.5,2018-08-14,,,Oracle Financial Services Software Limited,N,2018-08-06,2018-08-08,Listed,...,,,156.0,EQ,5.0,,3573.45,3494.9,INE881D01027,3580.0
CESC,751.85,13.68,0001-01-01,347.0,,CESC Limited,N,2018-10-30,0001-01-01,Listed,...,,,,EQ,10.0,,687.05,683.9,INE486A01013,688.0
HCLTECH,1087.6,13.8,0001-01-01,17.0,,HCL Technologies Limited,N,2018-10-30,0001-01-01,Listed,...,,,,EQ,2.0,,1022.2,997.0,INE860A01027,1022.0
