In [206]:
#Importing necessary libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re

In [17]:
#Retrieving the website
url = "https://www.zaubacorp.com/company/YNOS-VENTURE-ENGINE-CC-PRIVATE-LIMITED/U74999TN2017PTC115985"
#url = "https://www.zaubacorp.com/company/BUSINESSONBOT-PRIVATE-LIMITED/U72900KA2020PTC136387"
#requests waits indefinitely when no timeout is given; cap the wait so the
#cell fails with an exception instead of hanging if the server never answers
source = requests.get(url, timeout = 30)

In [18]:
#Tell the reader whether the request came back with HTTP 200
print(
    "Website successfully retrieved!!!"
    if source.status_code == 200
    else "Unable to retrieve the website"
)

Website successfully retrieved!!!


In [19]:
#Build the parse tree from the response body with the lxml parser
soup = BeautifulSoup(markup = source.text, features = "lxml")

In [207]:
#First <div> whose class is "container information"; the company data is read from it
container_div = soup.find("div", class_ = "container information")

## Description text about the company

In [216]:
#Function to obtain the description text
def desription(container_div):
    """
        Collect the free text that precedes the first large <div> child.

        Input: 
            container_div: content inside the div tag with class container information
        Output:
            des_text: summary of the company on Zaubacorp, as one space-joined string

        Children whose len() is 3 or less are ignored (short strings such as
        newlines; for a tag, len() counts its direct children). Scanning stops
        at the first remaining child whose markup starts with "<div".
    """
    fragments = []
    for node in container_div.contents: #direct children, in document order
        if len(node) <= 3: #Skipping small tags and escape sequences
            continue
        if str(node).startswith("<div"):
            break #Since all the description text is before any <div> tag
        if isinstance(node, str):
            fragments.append(node.strip())
        else:
            #A non-<div> tag has no callable .strip(); take its text instead of crashing
            fragments.append(node.get_text().strip())

    return " ".join(fragments) #List to string

In [221]:
#Run the extractor on the company-information div, keep the summary in des_text and print it
des_text = desription(container_div) 
print(des_text)

Ynos Venture Engine Cc Private Limited is a Private incorporated on 10 April 2017. It is classified as Non-govt company and is registered at Registrar of Companies, Chennai. Its authorized share capital is Rs. 1,500,000 and its paid up capital is Rs. 1,103,220. It is inolved in Business activities n.e.c. Ynos Venture Engine Cc Private Limited's Annual General Meeting (AGM) was last held on 10 October 2020 and as per records from Ministry of Corporate Affairs (MCA), its balance sheet was last filed on 31 March 2020. Directors of Ynos Venture Engine Cc Private Limited are Muthuraman, Hari Krishnan Kannappan, Annamalai Rajan Thillai, Pradeep Venkataramu Kumar and Viswanathan Sangeetha. Ynos Venture Engine Cc Private Limited's Corporate Identification Number is (CIN) U74999TN2017PTC115985 and its registration number is 115985.Its Email address is thillair@iitm.ac.in and its registered address is IIT Madras Research Park, Taramani Chennai Chennai TN 600113 IN ,  - ,  . Current status of Yno

## Obtaining Tabular data

In [222]:
#All "table table-striped" tables inside the company-information div.
#The name `div` is not defined by any earlier cell, so the lookup is done on
#container_div, the div with class "container information" found above.
tables = container_div.find_all("table", class_ = "table table-striped")

In [8]:
# #Function to retrieve the indexes of tables with useful information
# def return_indexes(tables):
#     table_indexes = []
#     for index,table in enumerate(tables):
#         num_of_rows = len(table.find_all("tr"))
#         num_of_cols = len(table.find_all("td"))
#         if num_of_rows > 2 and num_of_cols > 2 and index <= 7:
#             max_len = 0
#             for row in table.find_all("tr"):
#                 text = row.text.split("\n")
#                 text = [i.strip() for i in text if len(i.strip()) > 0]
#                 if len(text) > max_len:
#                     max_len = len(text)
#             if max_len > 1:
#                 table_indexes.append(index)
#     return table_indexes

In [11]:
# table_indexes = return_indexes(tables)
# table_indexes
# tables = [table for index, table in enumerate(tables) if index in table_indexes]

## Obtaining company details

In [229]:
#function to return a dataframe with all the company details
def company_details(tables, table_indexes = (0, 3, 4)):
    """
        Build a two-column Particulars/Value dataframe from selected tables.

        Input: 
            tables: tables inside the div tag with class container information
            table_indexes: positions in `tables` to read; defaults to (0, 3, 4),
                           the positions this notebook has always used
        Output:
            dataframe containing the information about the company from the table present on the website in div tag

        For every <tr> the first non-blank text line becomes the particular
        and the second the value. Rows with fewer than two non-blank lines
        (blank rows, single-cell headings) are skipped rather than raising
        IndexError. Surrounding whitespace is trimmed from both cells.
    """
    wanted = set(table_indexes)
    particulars, values = [], []
    for index, table in enumerate(tables): #looping over all the tables
        if index not in wanted:
            continue
        for row in table.find_all("tr"):
            cells = [cell.strip() for cell in row.text.split("\n") if cell.strip()]
            if len(cells) < 2: #nothing to pair up in this row
                continue
            particulars.append(cells[0])
            values.append(cells[1])

    return pd.DataFrame({"Particulars": particulars, "Value": values})

In [248]:
#Build the Particulars/Value dataframe from the scraped tables and display it
company_df = company_details(tables)
company_df

Unnamed: 0,Particulars,Value
0,CIN,U74999TN2017PTC115985
1,Company Name,YNOS VENTURE ENGINE CC PRIVATE LIMITED
2,Company Status,Active
3,RoC,RoC-Chennai
4,Registration Number,115985
5,Company Category,Company limited by Shares
6,Company Sub Category,Non-govt company
7,Class of Company,Private
8,Date of Incorporation,10 April 2017
9,Age of Company,"3 years, 11 month, 27 days"


### Adding address and email id to the dataframe

In [233]:
#Every <div> carrying the class "col-12"; the first one is used as the contact block
div_2 = soup.find_all("div", class_ = "col-12")
contact = div_2[0]
#Trim each line of the block's text, keep the non-blank ones and discard the leading entry
contact_list = [line for line in map(str.strip, contact.text.split("\n")) if len(line) > 0][1:]
print(contact_list)

['Email ID: thillair@iitm.ac.in Website: Click here   to add.', 'Address: IIT Madras Research Park, Taramani Chennai Chennai TN 600113 IN']


In [255]:
#Email ID and address
patterns = [r"(Email ID)(\W*)([a-zA-Z0-9+_.-]+@[a-zA-Z0-9.-]+)",r"(Address)(\W*)([\w\W]+)"] #regex patterns for email as well as address
keys = company_df.columns
contact_rows = [] #one {particular, value} record per piece of contact information found
for line, pattern in zip(contact_list, patterns): #Loop for the two pieces of information
    matches = re.search(pattern, line) #group 1 is the label, group 3 the value
    if matches is None: #line does not have the expected shape; skip it instead of crashing
        continue
    contact_rows.append({keys[0]: matches.group(1), keys[1]: matches.group(3)})

if contact_rows:
    contact_df = pd.DataFrame(contact_rows, columns = keys)
    #Append once with a fresh 0..n-1 index (each appended row used to get label 0),
    #and drop exact duplicates so re-running this cell does not add the rows again
    company_df = pd.concat([company_df, contact_df], ignore_index = True).drop_duplicates(ignore_index = True)

In [256]:
company_df

Unnamed: 0,Particulars,Value
0,CIN,U74999TN2017PTC115985
1,Company Name,YNOS VENTURE ENGINE CC PRIVATE LIMITED
2,Company Status,Active
3,RoC,RoC-Chennai
4,Registration Number,115985
5,Company Category,Company limited by Shares
6,Company Sub Category,Non-govt company
7,Class of Company,Private
8,Date of Incorporation,10 April 2017
9,Age of Company,"3 years, 11 month, 27 days"


## Board Members

In [258]:
#function to return the list of board members 
def board_details(table):
    """
        Turn the directors table into a dataframe.

        Input:
            table: table containing the information about the members
        Output:
            dataframe containing the details

        The first row with exactly four non-blank text lines supplies the
        column names. After that, every row with exactly five non-blank lines
        is a director: its first four lines fill the columns and the fifth is
        dropped. All other rows are ignored. If no four-line row is found, an
        empty dataframe is returned.
    """
    directors = {}
    for row in table.find_all("tr"):
        cells = [cell.strip() for cell in row.text.split("\n") if cell.strip()]
        if not directors:
            #Keep looking until the header shows up. Giving up after the very
            #first row meant a leading blank row produced an empty result.
            if len(cells) == 4:
                directors = {heading: [] for heading in cells}
            continue
        if len(cells) == 5:
            #zip stops at the last column, so the trailing fifth cell is discarded
            for heading, cell in zip(directors, cells):
                directors[heading].append(cell)

    return pd.DataFrame(directors) #return

In [259]:
#Convert the directors table to a dataframe and display it
board_df = board_details(tables[7]) #Table at index 7 contains the director related information
board_df

Unnamed: 0,DIN,Director Name,Designation,Appointment Date
0,2375046,MUTHURAMAN,Director,05 February 2018
1,2369394,HARI KRISHNAN KANNAPPAN,Director,27 September 2018
2,3559031,ANNAMALAI RAJAN THILLAI,Director,10 April 2017
3,7765457,PRADEEP VENKATARAMU KUMAR,Director,10 April 2017
4,7807397,VISWANATHAN SANGEETHA,Director,05 February 2018


## Companies nearby

In [260]:
#First table with id "results", then its rows.
#The name `div` is not defined by any earlier cell, so the lookup is done on
#container_div, the div with class "container information" found above.
companies_at_same_add = container_div.find_all("table", id = "results")[0]
details = companies_at_same_add.find_all("tr")

In [261]:
def companies_nearby(table):
    """
        Input: 
            table: iterable of row elements (each exposing .text) from the table
                   about the companies at the same address; the first row holds
                   the column headings
        Output: 
            dataframe with one line per remaining row, labelled with the
            headings; the row index starts at 1 because the heading row
            (label 0) is removed
    """
    cleaned_rows = [
        [piece for piece in row.text.split("\n") if piece.strip()]
        for row in table
    ]
    frame = pd.DataFrame(cleaned_rows)
    headings = frame.loc[0, :] #first row supplies the column names
    return frame.rename(columns = headings).drop(index = 0)

In [262]:
#Turn the rows of the "results" table into a dataframe and display it
companies_df = companies_nearby(details)
companies_df

Unnamed: 0,CIN,Name,Address
1,U74999TN2017PTC115985,YNOS VENTURE ENGINE CC PRIVATE LIMITED,"IIT Madras Research Park, Taramani Chennai Che..."
2,U72900TN2017PTC119928,GNANAM INSTITUTE FOR TRAINING IN ADVANCED ANAL...,"IIT Madras Research Park, Kanagam Road Taraman..."
3,U74900TN2012PTC084147,OKAPI ADVISORY SERVICES PRIVATE LIMITED,"01 FA, IIT Madras Research Park, Kanagam Road,..."
4,U74900TN2014PTC096794,HELYXON HEALTHCARE SOLUTIONS PRIVATE LIMITED,"IIT MADRAS RESEARCH PARK NO. 1, KANAGAM ROAD T..."
5,U74900TN2016PTC104497,OZONE MOTORS PRIVATE LIMITED,01 FA FIRST FLOOR IIT MADRAS RESEARCH PARK KAN...
6,U74999TN2017PTC118695,VAYUJAL TECHNOLOGIES PRIVATE LIMITED,"IIT MADRAS RESEARCH PARK, 01FA, FIRST FLOOR,KA..."
7,U74999TN2017PTC119307,ENVITRAN SMART SYSTEMS PRIVATE LIMITED,"IIT Madras Research Park 01FA, I Floor, Kanaga..."
8,U74999TN2018PTC126629,X2FUELS AND ENERGY PRIVATE LIMITED,"IIT Madras Research Park, 01 FA, I Floor Kanag..."
9,U74999TN2019PTC128905,DHVANI ANALYTIC INTELLIGENCE PRIVATE LIMITED,"01 FA, FIRST FLOOR, IIT MADRAS RESEARCH PARK K..."


### Details about directors' involvement in other companies

In [269]:
#Function to obtain the directors details
def directors_details(tables, board_df):
    """
        Stack the "other companies" table of every director into one dataframe.

        Input:
            tables: all the tables in the div element extracted above
            board_df: details about the directors in this company; the value at
                      row label i of its "Director Name" column is attached to
                      the i-th table read
        Output:
            directors_df: dataframe with all the companies each director is a part of except this particular company

        Tables are read from tables[8:18:2]. In each one the first row supplies
        the column names. Entries in the "Company" column matching
        "...not associated..." are replaced by the string "None". An empty
        dataframe is returned when the slice contains no tables.
    """
    frames = []
    for index, table in enumerate(tables[8:18:2]): #tables at these indexes only are relevant to us here
        rows = [
            [cell.strip() for cell in row.text.split("\n") if cell.strip()]
            for row in table.find_all("tr")
        ]
        df = pd.DataFrame(rows)
        df = df.rename(columns = df.iloc[0, :]).drop(index = 0) #first row holds the headers
        df["Director Name"] = board_df.loc[index, "Director Name"] #Adding director's name
        frames.append(df)

    if not frames:
        return pd.DataFrame()

    #One concat for all directors, with a fresh 0..n-1 index
    directors_df = pd.concat(frames, ignore_index = True)
    #Assign the result back: an inplace replace on directors_df["Company"] acts
    #on a temporary object and is not guaranteed to reach the dataframe
    directors_df["Company"] = directors_df["Company"].replace(
        to_replace = r"[\w\W]+not associated[\w\W]+", value = "None", regex = True
    )

    return directors_df

In [270]:
#Collect every director's other company appointments into one dataframe and display it
directors_df = directors_details(tables, board_df)
directors_df

Unnamed: 0,Company,Designation,Appointment Date,Director Name
0,CENTURYWELLS ROOFING INDIA PRIVATELIMITED,Director,27 January 2017,MUTHURAMAN
1,RESILEO LABS LLP,Individual Partner,08 September 2016,MUTHURAMAN
2,VISHAL PRECISION STEEL TUBES AND STRIPSPRIVATE...,Director,27 January 2017,MUTHURAMAN
3,TAURUS VALUE STEEL & PIPES PRIVATELIMITED,Director,27 January 2017,MUTHURAMAN
4,TATTVA FINCORP LIMITED,Director,07 September 2018,MUTHURAMAN
5,RIVERBRIDGE INVESTMENT ADVISORS PRIVATELIMITED,Director,27 November 2008,MUTHURAMAN
6,KOVAI MEDIA PRIVATE LIMITED,Director,05 November 2014,MUTHURAMAN
7,DEVASTHANAM PRANA FOUNDATION,Director,22 January 2021,MUTHURAMAN
8,KICKSTARTUP ADVISORY SERVICES LLP,Body Corporate DP Nominee,08 January 2019,HARI KRISHNAN KANNAPPAN
9,IDRIVE INFOSERVICES PRIVATE LIMITED,Additional Director,05 December 2018,HARI KRISHNAN KANNAPPAN
