In [1]:
import requests #A library to scrap the website 
from bs4 import BeautifulSoup #A library to extract the tags and extract useful information from the HTML data
import pandas as pd #A useful library for data analysis

In [2]:
website_url = "https://en.wikipedia.org/wiki/List_of_unicorn_startup_companies" #Initialising the url from which we will extract the information
website_HTML = requests.get(website_url).text #HTML fetched data
soup = BeautifulSoup(website_HTML, 'lxml') #Convert HTML data into soup object so that we can use in-built functions to extract information
valuationTable = soup.find_all('table', {'class':'wikitable sortable'}) #Define which tags you need to extract from HTML data and what is the name of the class (in case you have many tables and here by giving class you can distingush the data)
unicornTable = valuationTable[0] #Wiki Page has 2 tables with same class and first table consist of all startuos that are presently unicorn
pastUnicornTable = valuationTable[1] #Second table has information of all the startups which were previously unicorn and are now in exited state

In [3]:
#here I am defining a function in python to get pandas dataframe from extracted HTML information
def findDFfromTable(table):
    rows = table.findAll('tr') #Finding all the rows of the table <tr> is the tag that we used to define rows in HTML table
    parsed_table_data = []
    for row in rows: #Loop to clean the data and store in a 2D list data structure
        children = row.findChildren(recursive=False)
        row_text = []
        for child in children:
            clean_text = child.text
            #This is to discard reference/citation links
            clean_text = clean_text.split('&#91;')[0]
            #This is to clean the header row of the sort icons
            clean_text = clean_text.split('&#160;')[-1]
            clean_text = clean_text.strip()
            row_text.append(clean_text)
        parsed_table_data.append(row_text)
    
    df = pd.DataFrame.from_records(parsed_table_data)
    return df

In [4]:
unicornDF = findDFfromTable(unicornTable) #Converting HTML Information into Pandas Dataframe: This will simplify our data analysis

In [5]:
unicornDF.head() # Returning top 5 rows from the dataframe and first row shows the name of columns

Unnamed: 0,0,1,2,3
0,Company,Valuation (US $billion),Valuation date,Country/Region
1,Ant Financial,150,April 2018[10],China
2,Bytedance,78.0,November 2018[11],China
3,DiDi,51.6,December 2018[12],China
4,Airbnb,38.0,March 2018[13],US


In [6]:
unicornDF.columns = ['Company', 'Valuation (US $billion)', 'Valuation Date', 'Country'] #Setting the name of the columns
unicornDF = unicornDF.drop(0) #Dropping the first row from table
unicornDF.head()

Unnamed: 0,Company,Valuation (US $billion),Valuation Date,Country
1,Ant Financial,150.0,April 2018[10],China
2,Bytedance,78.0,November 2018[11],China
3,DiDi,51.6,December 2018[12],China
4,Airbnb,38.0,March 2018[13],US
5,Stripe,35.0,September 2019[14],US


In [7]:
unicornDF.info() #This will return the datatype information of all the columns 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 345 entries, 1 to 345
Data columns (total 4 columns):
Company                    345 non-null object
Valuation (US $billion)    345 non-null object
Valuation Date             345 non-null object
Country                    345 non-null object
dtypes: object(4)
memory usage: 13.5+ KB


In [8]:
unicornDF.shape #return the number of rows V/s Number of Columns

(345, 4)

In [9]:
unicornDF['Valuation (US $billion)'].unique() #Return all the unique valuations 

array(['150', '78.0', '51.6', '38.0', '35.0', '33.3', '30.0', '24.0',
       '20.0', '18.0', '15.0', '13.0', '11.0+', '10.77', '10.0+', '10.0',
       '9.23', '9.0', '8.8', '8.0', '7.6', '7.3', '7.1', '7.0', '7',
       '6.5', '6.4', '6.2', '5.5', '5.4', '5.0+', '5.0', '4.5', '4.4',
       '4.3', '4.2', '4.05', '4.0+', '4.0', '3.95', '3.8', '3.7', '3.65',
       '3.55', '3.5', '3.3', '3.25', '3.2', '3.17', '3+', '3.0', '2.9',
       '2.8', '2.775', '2.75', '2.7', '2.65', '2.6', '2.55', '2.5', '2.4',
       '2.39', '2.32', '2.19', '2.2', '2.1', '2.09', '2.0+', '2.0',
       '2.0-', '1.98', '1.95', '1.94', '1.93', '1.9', '1.89', '1.85',
       '1.82', '1.8', '1.78', '1.75', '1.7', '1.6', '1.5+', '1.5', '1.47',
       '1.46', '1.45', '1.4', '1.33', '1.32', '1.3', '1.28', '1.27',
       '1.26', '1.25', '1.2', '1.16', '1.13', '1.1', '1.05', '1.095', '1',
       '1.04', '1+', '1.0', '5+'], dtype=object)

In [10]:
unicornDF['Valuation (US $billion)'] = unicornDF['Valuation (US $billion)'].map(lambda x: x.rstrip('+-')) #Convert Valuation Column into float and this step will clean all the unnecessary signs like + or - as we can see in the above row

In [11]:
unicornDF['Valuation (US $billion)'] = unicornDF['Valuation (US $billion)'].astype(float) #Convert data into float

In [12]:
unicornDF.info() #Please compare the Valuation (US (#billion)) from the above row

<class 'pandas.core.frame.DataFrame'>
Int64Index: 345 entries, 1 to 345
Data columns (total 4 columns):
Company                    345 non-null object
Valuation (US $billion)    345 non-null float64
Valuation Date             345 non-null object
Country                    345 non-null object
dtypes: float64(1), object(3)
memory usage: 13.5+ KB


In [13]:
unicornDF.count()

Company                    345
Valuation (US $billion)    345
Valuation Date             345
Country                    345
dtype: int64

In [14]:
#Sort the dataframe with respect to Valuation in descending order and find new dataframe from top 30 Valued
top30Valued = unicornDF.sort_values('Valuation (US $billion)', ascending = False).head(30)
top30Valued

Unnamed: 0,Company,Valuation (US $billion),Valuation Date,Country
1,Ant Financial,150.0,April 2018[10],China
2,Bytedance,78.0,November 2018[11],China
3,DiDi,51.6,December 2018[12],China
4,Airbnb,38.0,March 2018[13],US
5,Stripe,35.0,September 2019[14],US
6,SpaceX,33.3,May 2019[15],US
7,Lufax,30.0,June 2018[16],China
8,JUUL Labs,24.0,October 2019[17],US
9,Cainiao,20.0,September 2017[16],China
10,Palantir Technologies,20.0,November 2018[18],US


In [15]:
#Now the data analysis part starts
#This will countrywise count of unicorns in top30 unicorns
#Basically The data tells us that there are 12 Chinese Unicorns in top 30 Unicorns of world
#11 Unicorns of US in top 30 Unicorns of World
#2 Unicorns of India in top 30 unicorns of World
top30Valued['Country'].value_counts()

China          12
US             11
India           2
Indonesia       1
Brazil          1
Singapore       1
South Korea     1
Canada          1
Name: Country, dtype: int64

# Now we want to know which 2 unicorn startups of India are in this list

In [16]:
#This will return the name of the companies which are Indian Unicorns in top 30 Unicorns of the world. 
top30Valued[top30Valued['Country']=="India"]

Unnamed: 0,Company,Valuation (US $billion),Valuation Date,Country
12,Paytm,18.0,March 2019[19],India
19,OYO,10.0,June 2019[26],India


# Surprised Flipkart is not in the list: You will come to know later 

# Now we want a list of all the countries Where Unicorns existed

In [17]:
uniqueCountry = unicornDF['Country'].unique()
uniqueCountry.sort()
print("Total Number of Countries", len(uniqueCountry))
for country in uniqueCountry:
    print(country)

Total Number of Countries 37
Argentina
Australia
Belgium
Belgium /  Netherlands
Brazil
Canada
Chile
China
Colombia
Estonia
France
Germany
Hong Kong
India
India /  US
India/ US
Indonesia
Israel
Japan
Lithuania
Luxembourg
Portugal
Portugal /  UK
Russia
Singapore
Singapore /  Israel
Singapore /   Switzerland
South Korea
Spain
Sweden
Switzerland
UAE
UK
US
US /  Israel
US /  Romania
Ukraine


## Let's find some more information


1. Sum of valuation of all unicorns
2. Percentage of Money in top 30 unicorns compared to all unicorns
3. Highest Valued Company
4. Number of Multi-Billion Startups (Valuation > $1B)

In [18]:
print("The total valuation of all the unicorns is", unicornDF['Valuation (US $billion)'].sum(), "Billion Dollars")

The total valuation of all the unicorns is 1297.01 Billion Dollars


In [19]:
print("The % valuation money of top 30 startups", top30Valued['Valuation (US $billion)'].sum()*100/unicornDF['Valuation (US $billion)'].sum(), "%" )

The % valuation money of top 30 startups 53.453712770140555 %


In [20]:
print("Highest Valued Company is ", top30Valued['Company'][1])

Highest Valued Company is  Ant Financial


In [21]:
print("Number of Multi-Billion Startups (Valuation > $1B)", unicornDF[unicornDF['Valuation (US $billion)']>1].count())

Number of Multi-Billion Startups (Valuation > $1B) Company                    237
Valuation (US $billion)    237
Valuation Date             237
Country                    237
dtype: int64


In [22]:
multiBillion = unicornDF[unicornDF['Valuation (US $billion)']>1]

In [23]:
multiBillion['Country'].value_counts() #Country-wise list of multi-billion startups

US                     88
China                  80
India                  15
South Korea            10
UK                     10
Indonesia               5
Singapore               2
Sweden                  2
Switzerland             2
Germany                 2
France                  2
Israel                  2
Singapore /  Israel     1
Brazil                  1
Spain                   1
Belgium                 1
US /  Israel            1
Chile                   1
Portugal /  UK          1
Canada                  1
Colombia                1
Australia               1
US /  Romania           1
India /  US             1
Russia                  1
Ukraine                 1
Luxembourg              1
Japan                   1
Lithuania               1
Name: Country, dtype: int64

In [24]:
unicornDF['Valuation (US $billion)'].median()

1.5

In [25]:
multiBillion['Valuation (US $billion)'].mode()

0    2.0
dtype: float64