# Data Visualisation - Unicorn Companies

## Importing Libraries

In [1]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Location of the dataset. The default is the author's local copy; set the
# UNICORN_CSV environment variable to load the file from another location
# without editing the notebook.
DATA_PATH = os.environ.get(
    "UNICORN_CSV",
    "/Users/roshanchandru/Desktop/DataVisulaisation_project/Unicorn_Companies.csv",
)
df = pd.read_csv(DATA_PATH)
df.head(3)

Unnamed: 0,Company,Valuation,Date Joined,Industry,City,Country,Continent,Year Founded,Funding,Select Investors
0,Bytedance,$180B,2017-04-07,Artificial intelligence,Beijing,China,Asia,2012,$8B,"Sequoia Capital China, SIG Asia Investments, S..."
1,SpaceX,$100B,2012-12-01,Other,Hawthorne,United States,North America,2002,$7B,"Founders Fund, Draper Fisher Jurvetson, Rothen..."
2,SHEIN,$100B,2018-07-03,E-commerce & direct-to-consumer,Shenzhen,China,Asia,2008,$2B,"Tiger Global Management, Sequoia Capital China..."


In [3]:
# Show the dtype pandas assigned to each column of the loaded frame.
df.dtypes

Company             object
Valuation           object
Date Joined         object
Industry            object
City                object
Country             object
Continent           object
Year Founded         int64
Funding             object
Select Investors    object
dtype: object

## Details of the dataset:
- Company (object): The name of the company.
- Valuation (object): The valuation of the company in USD, stored as a string such as `$180B`.
- Date Joined (object): The date the company joined the unicorn list.
- Industry (object): The industry sector the company operates in.
- City (object): The city where the company is headquartered.
- Country (object): The country where the company is located.
- Continent (object): The continent where the company is located.
- Year Founded (int64): The year the company was founded.
- Funding (object): The total funding received by the company in USD, stored as a string such as `$8B`.
- Select Investors (object): Key investors in the company.

In [4]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(1074, 10)

In [5]:
# Summary statistics; with default arguments only numeric columns are summarised.
df.describe()

Unnamed: 0,Year Founded
count,1074.0
mean,2012.895717
std,5.698573
min,1919.0
25%,2011.0
50%,2014.0
75%,2016.0
max,2021.0


## Data Cleaning

In [6]:
# Total number of missing cells across the whole frame (all columns combined).
df.isna().to_numpy().sum()

17

In [7]:
# Duplicate flags for the first five rows only; this is a peek, not a count
# of duplicates in the whole frame.
df.duplicated().iloc[:5]

0    False
1    False
2    False
3    False
4    False
dtype: bool

### Impute missing values for 'City' and 'Select Investors' with placeholders

In [8]:
# Replace missing values with explicit placeholders.
# The filled column is assigned back instead of calling
# fillna(..., inplace=True) on df[col]: the in-place call on a column
# selection is chained assignment, which emits a FutureWarning in pandas 2.x
# and leaves df unchanged once Copy-on-Write is enabled.
df['City'] = df['City'].fillna('Unknown')
df['Select Investors'] = df['Select Investors'].fillna('Not Available')

### Convert 'Date Joined' to datetime format

In [9]:
# Parse the 'Date Joined' strings into datetime64 values.
df['Date Joined'] = df['Date Joined'].pipe(pd.to_datetime)

### Identify and remove duplicate rows

In [10]:
# Record the row count on either side of the de-duplication so the next cell
# can report how many rows were dropped.
data_before = df.shape[0]
df.drop_duplicates(inplace=True)
data_after = df.shape[0]

### Display the first few rows of the cleaned dataset and the change in dataset size due to duplicate removal

In [11]:
# Print the first rows of the cleaned frame, then the number of duplicate rows removed.
print(df.head(), f'Duplicates Removed: {data_before - data_after}', sep='\n')

     Company Valuation Date Joined                         Industry  \
0  Bytedance     $180B  2017-04-07          Artificial intelligence   
1     SpaceX     $100B  2012-12-01                            Other   
2      SHEIN     $100B  2018-07-03  E-commerce & direct-to-consumer   
3     Stripe      $95B  2014-01-23                          Fintech   
4     Klarna      $46B  2011-12-12                          Fintech   

            City        Country      Continent  Year Founded Funding  \
0        Beijing          China           Asia          2012     $8B   
1      Hawthorne  United States  North America          2002     $7B   
2       Shenzhen          China           Asia          2008     $2B   
3  San Francisco  United States  North America          2010     $2B   
4      Stockholm         Sweden         Europe          2005     $4B   

                                    Select Investors  
0  Sequoia Capital China, SIG Asia Investments, S...  
1  Founders Fund, Draper Fishe

In [12]:
# Re-count missing cells after imputation.
df.isna().to_numpy().sum()

0

In [13]:
# (rows, columns) after cleaning, for comparison with the shape shown before cleaning.
df.shape

(1074, 10)

## Creating a new dataset to analyse the most active investors

In [17]:
import pandas as pd

def main():
    """Print the ten investor names that occur most often in 'Select Investors'.

    Each cell of the column is split on ', ' so that every listed investor
    becomes its own row before counting. Reads the notebook-level ``df``.

    NOTE: rows whose investors were missing carry the 'Not Available'
    placeholder from the cleaning step, and that placeholder is counted here
    like any other name.
    """
    # One row per (company, investor) pair.
    per_investor = df['Select Investors'].str.split(', ').explode()
    frequency = per_investor.value_counts()

    investor_frequency_df = pd.DataFrame(frequency.items(), columns=['Investor', 'Frequency'])
    investor_frequency_df = investor_frequency_df.sort_values(by='Frequency', ascending=False)
    investor_frequency_df = investor_frequency_df.reset_index(drop=True)

    print(investor_frequency_df.head(10))


if __name__ == '__main__':
    main()

                      Investor  Frequency
0                        Accel         60
1      Tiger Global Management         53
2          Andreessen Horowitz         53
3        Sequoia Capital China         48
4             Insight Partners         47
5              Sequoia Capital         47
6  Lightspeed Venture Partners         34
7               SoftBank Group         34
8             General Catalyst         33
9               Index Ventures         32


### Removing the '$' sign and the 'B'/'M' suffix, and creating a new column 'Valuation (in $M)' (valuation in millions of dollars)

In [20]:
def convert_valuation_to_millions(valuation):
    """Convert a valuation string such as '$180B' or '$500M' to millions of dollars.

    Parameters
    ----------
    valuation : str or missing value
        Text with an optional '$' and an optional 'B' (billions) or
        'M' (millions) marker; case-insensitive.

    Returns
    -------
    float or None
        The amount in millions, or None when ``valuation`` is missing.
        Text without a 'B' or 'M' marker is taken to be in millions already.
    """
    if pd.isnull(valuation):
        return None

    amount_text = valuation.upper().replace('$', '')

    # Markers are checked in this order; the first one present decides the scale.
    for marker, to_millions in (('B', 1000), ('M', 1)):
        if marker in amount_text:
            return float(amount_text.replace(marker, '')) * to_millions

    return float(amount_text)

# Numeric valuation in millions of dollars, derived from the 'Valuation' strings.
df['Valuation (in $M)'] = df['Valuation'].map(convert_valuation_to_millions)
# Ensure 'Date Joined' is datetime, parsed with the explicit ISO format.
df['Date Joined'] = pd.to_datetime(df['Date Joined'], format='%Y-%m-%d')
# Preview the original and converted valuation side by side.
df.loc[:, ['Company', 'Valuation', 'Valuation (in $M)', 'Date Joined']].head()

Unnamed: 0,Company,Valuation,Valuation (in $M),Date Joined
0,Bytedance,$180B,180000.0,2017-04-07
1,SpaceX,$100B,100000.0,2012-12-01
2,SHEIN,$100B,100000.0,2018-07-03
3,Stripe,$95B,95000.0,2014-01-23
4,Klarna,$46B,46000.0,2011-12-12


### Extract year from "Date Joined" column and add a new column "Year Joined"

In [21]:
# 'Date Joined' is already datetime at this point, so the year can be read directly.
df['Year Joined'] = df['Date Joined'].dt.year

In [22]:
# Preview the first three rows, now including 'Valuation (in $M)' and 'Year Joined'.
df.head(3)

Unnamed: 0,Company,Valuation,Date Joined,Industry,City,Country,Continent,Year Founded,Funding,Select Investors,Valuation (in $M),Year Joined
0,Bytedance,$180B,2017-04-07,Artificial intelligence,Beijing,China,Asia,2012,$8B,"Sequoia Capital China, SIG Asia Investments, S...",180000.0,2017
1,SpaceX,$100B,2012-12-01,Other,Hawthorne,United States,North America,2002,$7B,"Founders Fund, Draper Fisher Jurvetson, Rothen...",100000.0,2012
2,SHEIN,$100B,2018-07-03,E-commerce & direct-to-consumer,Shenzhen,China,Asia,2008,$2B,"Tiger Global Management, Sequoia Capital China...",100000.0,2018
