In [78]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px


In [79]:
companies=pd.read_csv(r"C:\Users\sijan\Downloads\Unicorn_Companies.csv")

In [80]:
# Run this cell so pandas displays all columns
pd.set_option('display.max_columns', None)

In [81]:
companies.head()

Unnamed: 0,Company,Valuation,Date Joined,Industry,City,Country/Region,Continent,Year Founded,Funding,Select Investors
0,Bytedance,$180B,4/7/17,Artificial intelligence,Beijing,China,Asia,2012,$8B,"Sequoia Capital China, SIG Asia Investments, S..."
1,SpaceX,$100B,12/1/12,Other,Hawthorne,United States,North America,2002,$7B,"Founders Fund, Draper Fisher Jurvetson, Rothen..."
2,SHEIN,$100B,7/3/18,E-commerce & direct-to-consumer,Shenzhen,China,Asia,2008,$2B,"Tiger Global Management, Sequoia Capital China..."
3,Stripe,$95B,1/23/14,Fintech,San Francisco,United States,North America,2010,$2B,"Khosla Ventures, LowercaseCapital, capitalG"
4,Klarna,$46B,12/12/11,Fintech,Stockholm,Sweden,Europe,2005,$4B,"Institutional Venture Partners, Sequoia Capita..."


In [82]:
companies.dtypes

Company             object
Valuation           object
Date Joined         object
Industry            object
City                object
Country/Region      object
Continent           object
Year Founded         int64
Funding             object
Select Investors    object
dtype: object

In [83]:
companies['Date Joined'] = pd.to_datetime(companies['Date Joined'])

  companies['Date Joined'] = pd.to_datetime(companies['Date Joined'])


In [84]:
#lets find out how many years did it take them to become billion dollars companies 
companies['Years To Unicorn'] = companies['Date Joined'].dt.year - companies['Year Founded']
companies[['Company','Years To Unicorn']].head()

Unnamed: 0,Company,Years To Unicorn
0,Bytedance,5
1,SpaceX,10
2,SHEIN,10
3,Stripe,4
4,Klarna,6


In [85]:
# let's check if there are any rows having negative values in Years To Unicorn column because those are the error values
companies[companies['Years To Unicorn']<0]

Unnamed: 0,Company,Valuation,Date Joined,Industry,City,Country/Region,Continent,Year Founded,Funding,Select Investors,Years To Unicorn
714,Yidian Zixun,$1B,2017-10-17,Mobile & telecommunications,Beijing,China,Asia,2021,$151M,"Phoenix New Media, Tianjin Haihe Industry Fund",-4


In [86]:
#as found there is one company with negative value in Years to Unicorn Column
# it was found there was a typing error and it was types incorrectly as 2021 instead of 2011 in Year Founded column
companies.loc[companies['Company']=='Yidian Zixun','Year Founded']=2011
companies[companies['Company']=="Yidian Zixun"]

Unnamed: 0,Company,Valuation,Date Joined,Industry,City,Country/Region,Continent,Year Founded,Funding,Select Investors,Years To Unicorn
714,Yidian Zixun,$1B,2017-10-17,Mobile & telecommunications,Beijing,China,Asia,2011,$151M,"Phoenix New Media, Tianjin Haihe Industry Fund",-4


In [87]:
# now let's again run the code to find the years taken to see if the rectification is made 
companies['Years To Unicorn'] = companies['Date Joined'].dt.year - companies['Year Founded']


In [88]:

companies[companies['Years To Unicorn']<0]

Unnamed: 0,Company,Valuation,Date Joined,Industry,City,Country/Region,Continent,Year Founded,Funding,Select Investors,Years To Unicorn


In [89]:
#there are no rows so we can be sure that necessary rectification were made


In [90]:
# List provided by the company of the expected industry labels in the data
industry_list = ['Artificial intelligence', 'Other','E-commerce & direct-to-consumer', 'Fintech',\
       'Internet software & services','Supply chain, logistics, & delivery', 'Consumer & retail',\
       'Data management & analytics', 'Edtech', 'Health', 'Hardware','Auto & transportation', \
        'Travel', 'Cybersecurity','Mobile & telecommunications']

In [91]:
set(companies['Industry']) - set(industry_list)

{'Artificial Intelligence'}

In [92]:
# 1. Create `replacement_dict`
\
replacement_dict = {'Artificial Intelligence': 'Artificial intelligence'}
                

# 2. Replace the incorrect values in the `Industry` column


companies['Industry'] = companies['Industry'].replace(replacement_dict)

# 3. Verify that there are no longer any elements in `Industry` that are not in `industry_list`

set(companies['Industry']) - set(industry_list)

set()

In [93]:
#lets isolate rows of all the companies that have duplicates 
companies[companies.duplicated(subset=['Company'], keep=False)]

Unnamed: 0,Company,Valuation,Date Joined,Industry,City,Country/Region,Continent,Year Founded,Funding,Select Investors,Years To Unicorn
40,Bolt,$11B,2018-05-29,Auto & transportation,Tallinn,Estonia,Europe,2013,$1B,"Didi Chuxing, Diamler, TMT Investments",5
44,Bolt,$11B,2021-10-08,Fintech,San Francisco,United States,North America,2014,$1B,"Activant Capital, Tribe Capital, General Atlantic",7


In [94]:
# let's keep only the first occurence of each duplicate 
companies = companies.drop_duplicates(subset=['Company'], keep='first')



The input validation steps for this lab included:

Fixing incorrect values
Correcting inconsistencies in the data
Removing duplicate data

In [105]:
companies.head()

Unnamed: 0,Company,Valuation,Date Joined,Industry,City,Country/Region,Continent,Year Founded,Funding,Select Investors,Years To Unicorn
0,Bytedance,,2017-04-07,Artificial intelligence,Beijing,China,Asia,2012,$8B,"Sequoia Capital China, SIG Asia Investments, S...",5
1,SpaceX,,2012-12-01,Other,Hawthorne,United States,North America,2002,$7B,"Founders Fund, Draper Fisher Jurvetson, Rothen...",10
2,SHEIN,,2018-07-03,E-commerce & direct-to-consumer,Shenzhen,China,Asia,2008,$2B,"Tiger Global Management, Sequoia Capital China...",10
3,Stripe,,2014-01-23,Fintech,San Francisco,United States,North America,2010,$2B,"Khosla Ventures, LowercaseCapital, capitalG",4
4,Klarna,,2011-12-12,Fintech,Stockholm,Sweden,Europe,2005,$4B,"Institutional Venture Partners, Sequoia Capita...",6


In [106]:
companies['Valuation'].count()


0

In [107]:
companies['Valuation'] = pd.to_numeric(companies['Valuation'], errors='coerce')

In [108]:
non_numeric_values = companies[~companies['Valuation'].apply(pd.to_numeric, errors='coerce').notna()]
print(non_numeric_values)


          Company  Valuation Date Joined                         Industry  \
0       Bytedance        NaN  2017-04-07          Artificial intelligence   
1          SpaceX        NaN  2012-12-01                            Other   
2           SHEIN        NaN  2018-07-03  E-commerce & direct-to-consumer   
3          Stripe        NaN  2014-01-23                          Fintech   
4          Klarna        NaN  2011-12-12                          Fintech   
...           ...        ...         ...                              ...   
1069     Zhaogang        NaN  2017-06-29  E-commerce & direct-to-consumer   
1070  Zhuan Zhuan        NaN  2017-04-18  E-commerce & direct-to-consumer   
1071     Zihaiguo        NaN  2021-05-06                Consumer & retail   
1072         Zopa        NaN  2021-10-19                          Fintech   
1073        Zwift        NaN  2020-09-16  E-commerce & direct-to-consumer   

               City  Country/Region      Continent  Year Founded Funding  \

In [109]:
missing_values = companies[companies['Valuation'].isnull()]
print(missing_values)


          Company  Valuation Date Joined                         Industry  \
0       Bytedance        NaN  2017-04-07          Artificial intelligence   
1          SpaceX        NaN  2012-12-01                            Other   
2           SHEIN        NaN  2018-07-03  E-commerce & direct-to-consumer   
3          Stripe        NaN  2014-01-23                          Fintech   
4          Klarna        NaN  2011-12-12                          Fintech   
...           ...        ...         ...                              ...   
1069     Zhaogang        NaN  2017-06-29  E-commerce & direct-to-consumer   
1070  Zhuan Zhuan        NaN  2017-04-18  E-commerce & direct-to-consumer   
1071     Zihaiguo        NaN  2021-05-06                Consumer & retail   
1072         Zopa        NaN  2021-10-19                          Fintech   
1073        Zwift        NaN  2020-09-16  E-commerce & direct-to-consumer   

               City  Country/Region      Continent  Year Founded Funding  \

In [115]:
# Fill missing values with 0
companies['Valuation'].fillna(0, inplace=True)

# Set the threshold value
threshold = 1000000  # For example, $1 million

# Create the 'High Valuation' column based on the threshold
companies['High Valuation'] = np.where(companies['Valuation'] > threshold, 'high', 'low')
print(companies[['Company', 'Valuation', 'High Valuation']])

          Company  Valuation High Valuation
0       Bytedance        0.0            low
1          SpaceX        0.0            low
2           SHEIN        0.0            low
3          Stripe        0.0            low
4          Klarna        0.0            low
...           ...        ...            ...
1069     Zhaogang        0.0            low
1070  Zhuan Zhuan        0.0            low
1071     Zihaiguo        0.0            low
1072         Zopa        0.0            low
1073        Zwift        0.0            low

[1073 rows x 3 columns]


In [116]:
# Create numeric `Continent Number` column

### YOUR CODE HERE ###
continent_dict = {'North America': 1,
                  'Asia': 2,
                  'Europe': 3,
                  'South America': 4,
                  'Oceania': 5,
                  'Africa': 6
                 }
companies['Continent Number'] = companies['Continent'].replace(continent_dict)
companies.head()

Unnamed: 0,Company,Valuation,Date Joined,Industry,City,Country/Region,Continent,Year Founded,Funding,Select Investors,Years To Unicorn,High Valuation,Continent Number
0,Bytedance,0.0,2017-04-07,Artificial intelligence,Beijing,China,Asia,2012,$8B,"Sequoia Capital China, SIG Asia Investments, S...",5,low,2
1,SpaceX,0.0,2012-12-01,Other,Hawthorne,United States,North America,2002,$7B,"Founders Fund, Draper Fisher Jurvetson, Rothen...",10,low,1
2,SHEIN,0.0,2018-07-03,E-commerce & direct-to-consumer,Shenzhen,China,Asia,2008,$2B,"Tiger Global Management, Sequoia Capital China...",10,low,2
3,Stripe,0.0,2014-01-23,Fintech,San Francisco,United States,North America,2010,$2B,"Khosla Ventures, LowercaseCapital, capitalG",4,low,1
4,Klarna,0.0,2011-12-12,Fintech,Stockholm,Sweden,Europe,2005,$4B,"Institutional Venture Partners, Sequoia Capita...",6,low,3


In [117]:
# Create numeric categories for Country/Region
companies['Country/Region Numeric'] = companies['Country/Region'].astype('category').cat.codes

In [118]:

# Create dummy variables with Industry values
industry_encoded = pd.get_dummies(companies['Industry'])

# Combine `companies` DataFrame with new dummy Industry columns
companies = pd.concat([companies, industry_encoded], axis=1)

In [119]:
companies.head()

Unnamed: 0,Company,Valuation,Date Joined,Industry,City,Country/Region,Continent,Year Founded,Funding,Select Investors,Years To Unicorn,High Valuation,Continent Number,Country/Region Numeric,Artificial intelligence,Auto & transportation,Consumer & retail,Cybersecurity,Data management & analytics,E-commerce & direct-to-consumer,Edtech,Fintech,Hardware,Health,Internet software & services,Mobile & telecommunications,Other,"Supply chain, logistics, & delivery",Travel
0,Bytedance,0.0,2017-04-07,Artificial intelligence,Beijing,China,Asia,2012,$8B,"Sequoia Capital China, SIG Asia Investments, S...",5,low,2,9,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,SpaceX,0.0,2012-12-01,Other,Hawthorne,United States,North America,2002,$7B,"Founders Fund, Draper Fisher Jurvetson, Rothen...",10,low,1,44,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False
2,SHEIN,0.0,2018-07-03,E-commerce & direct-to-consumer,Shenzhen,China,Asia,2008,$2B,"Tiger Global Management, Sequoia Capital China...",10,low,2,9,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False
3,Stripe,0.0,2014-01-23,Fintech,San Francisco,United States,North America,2010,$2B,"Khosla Ventures, LowercaseCapital, capitalG",4,low,1,44,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False
4,Klarna,0.0,2011-12-12,Fintech,Stockholm,Sweden,Europe,2005,$4B,"Institutional Venture Partners, Sequoia Capita...",6,low,3,38,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False
