## Project Description
##### This project explores the global startup ecosystem by analyzing the rise of unicorn companies around the world. Unicorns are startups with a valuation of more than USD 1 billion.

In [22]:
# Importing libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [23]:
# Scrape the unicorn list; read_html returns every table matching the
# given HTML attributes, and the first match is the one used here.
unicorn_url = 'https://www.cbinsights.com/research-unicorn-companies'
unicorn_tables = pd.read_html(unicorn_url, attrs={'class': 'sortable-theme-bootstrap'})
df = unicorn_tables[0]
df.head()

Unnamed: 0,Company,Valuation ($B),Date Joined,Country,City,Industry,Select Investors
0,ByteDance,$140,4/7/2017,China,Beijing,Artificial intelligence,"Sequoia Capital China, SIG Asia Investments, S..."
1,SpaceX,$127,12/1/2012,United States,Hawthorne,Other,"Founders Fund, Draper Fisher Jurvetson, Rothen..."
2,SHEIN,$100,7/3/2018,China,Shenzhen,E-commerce & direct-to-consumer,"Tiger Global Management, Sequoia Capital China..."
3,Stripe,$95,1/23/2014,United States,San Francisco,Fintech,"Khosla Ventures, LowercaseCapital, capitalG"
4,Canva,$40,1/8/2018,Australia,Surry Hills,Internet software & services,"Sequoia Capital China, Blackbird Ventures, Mat..."


In [24]:
# Print the column names, non-null counts and dtypes of the freshly loaded table
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1175 entries, 0 to 1174
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Company           1175 non-null   object
 1   Valuation ($B)    1175 non-null   object
 2   Date Joined       1175 non-null   object
 3   Country           1175 non-null   object
 4   City              1158 non-null   object
 5   Industry          1175 non-null   object
 6   Select Investors  1174 non-null   object
dtypes: object(7)
memory usage: 32.2+ KB


In [25]:
# Renaming columns to short snake_case names
column_names = {
    'Company': 'company',
    'Valuation ($B)': 'valuation',
    'Date Joined': 'join_date',
    'Country': 'country',
    'City': 'city',
    'Industry': 'industry',
    'Select Investors': 'investors',
}
df.rename(columns=column_names, inplace=True)

In [26]:
# Reformatting column - industry
# Map the verbose industry labels to short names. Both spellings of
# 'Artificial intelligence' / 'Artificial Intelligence' collapse to 'AI'.
industry_short_names = {
    'Internet software & services': 'Software',
    'E-commerce & direct-to-consumer': 'E-commerce',
    'Artificial intelligence': 'AI',
    'Supply chain, logistics, & delivery': 'Logistics',
    'Data management & analytics': 'Analytics',
    'Auto & transportation': 'Transportation',
    'Mobile & telecommunications': 'Telecom',
    'Consumer & retail': 'Retail',
    'Artificial Intelligence': 'AI',
}

# Assign the result back rather than calling replace(..., inplace=True) on
# df['industry']: with pandas Copy-on-Write an inplace method on a column
# selection updates a temporary object and leaves df itself unchanged.
df['industry'] = df['industry'].replace(industry_short_names)

df['industry'].value_counts()

Fintech           242
Software          224
E-commerce        106
Health             93
AI                 87
Logistics          66
Other              65
Cybersecurity      58
Analytics          45
Transportation     39
Hardware           38
Telecom            38
Edtech             32
Retail             28
Travel             14
Name: industry, dtype: int64

In [27]:
# Reformatting column - investors
# Split the comma-separated investor list into one column per investor.
# The number of resulting columns is taken from the data instead of being
# fixed at four: assigning the split result to a hard-coded list of column
# names raises ValueError as soon as the widest row has a different count.
investor_parts = df['investors'].str.split(',', expand=True)
investor_cols = [f'investor_{i}' for i in range(1, investor_parts.shape[1] + 1)]
df[investor_cols] = investor_parts

# The combined column is redundant once it has been split out
df = df.drop('investors', axis=1)

In [28]:
# Cleaning column data
# Keep only the numeric part of the valuation (e.g. '$140' -> '140').
# The dot is escaped so it matches a literal decimal point only, and
# expand=False makes extract return a Series rather than a one-column frame.
df['valuation'] = (
    df['valuation']
    .str.strip('$')  # removing dollar sign
    .str.extract(r'([0-9]+\.?[0-9]*)', expand=False)  # extracting the floating point number using regex
)

# Removing leading and trailing whitespace (if any). Only object columns are
# processed because the .str accessor raises on non-string dtypes.
for col in df.select_dtypes(include='object').columns:
    df[col] = df[col].str.strip()

In [29]:
# Changing column data types: numeric valuation, parsed join date, and
# categorical country / industry
column_dtypes = {
    'valuation': 'float64',
    'join_date': 'datetime64[ns]',
    'country': 'category',
    'industry': 'category',
}
df = df.astype(column_dtypes)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1175 entries, 0 to 1174
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   company     1175 non-null   object        
 1   valuation   1175 non-null   float64       
 2   join_date   1175 non-null   datetime64[ns]
 3   country     1175 non-null   category      
 4   city        1158 non-null   object        
 5   industry    1175 non-null   category      
 6   investor_1  1174 non-null   object        
 7   investor_2  1124 non-null   object        
 8   investor_3  1035 non-null   object        
 9   investor_4  9 non-null      object        
dtypes: category(2), datetime64[ns](1), float64(1), object(6)
memory usage: 49.3+ KB
