# Predicting the Success of Startups

Our goal is to utilize the data analysis skills that we have been building throughout the semester and primarily assess what makes startups successful and why. Success, in this case, is defined by either the acquisition of the startup by another company (M&A) or by the startup going public (IPO). This research goal has scope to dive into many deeper underlying questions. 

**Questions include:**
- What factors of a startup make it prone to additional rounds of VC funding
- How does that impact their revenue and overall goal? 
- What is the probability that a startup becomes successful given the amount of funding they have received? 
- How does the age of a startup play a role in funding and success?



In [1]:
import numpy as np
import pandas as pd
import math

In [2]:
# Run this code to load the data
startups = pd.read_csv('startup-data.csv', index_col=0)
print(startups.shape)
startups.head()

(923, 48)


Unnamed: 0_level_0,state_code,latitude,longitude,zip_code,id,city,Unnamed: 6,name,labels,founded_at,...,object_id,has_VC,has_angel,has_roundA,has_roundB,has_roundC,has_roundD,avg_participants,is_top500,status
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1005,CA,42.35888,-71.05682,92101,c:6669,San Diego,,Bandsintown,1,1/1/2007,...,c:6669,0,1,0,0,0,0,1.0,0,acquired
204,CA,37.238916,-121.973718,95032,c:16283,Los Gatos,,TriCipher,1,1/1/2000,...,c:16283,1,0,0,1,1,1,4.75,1,acquired
1001,CA,32.901049,-117.192656,92121,c:65620,San Diego,San Diego CA 92121,Plixi,1,3/18/2009,...,c:65620,0,0,1,0,0,0,4.0,1,acquired
738,CA,37.320309,-122.05004,95014,c:42668,Cupertino,Cupertino CA 95014,Solidcore Systems,1,1/1/2002,...,c:42668,0,0,0,1,1,1,3.3333,1,acquired
1002,CA,37.779281,-122.419236,94105,c:65806,San Francisco,San Francisco CA 94105,Inhale Digital,0,8/1/2010,...,c:65806,1,1,0,0,0,0,1.0,1,closed


In [3]:
startups.isnull().sum()

state_code                    0
latitude                      0
longitude                     0
zip_code                      0
id                            0
city                          0
Unnamed: 6                  493
name                          0
labels                        0
founded_at                    0
closed_at                   588
first_funding_at              0
last_funding_at               0
age_first_funding_year        0
age_last_funding_year         0
age_first_milestone_year    152
age_last_milestone_year     152
relationships                 0
funding_rounds                0
funding_total_usd             0
milestones                    0
state_code.1                  1
is_CA                         0
is_NY                         0
is_MA                         0
is_TX                         0
is_otherstate                 0
category_code                 0
is_software                   0
is_web                        0
is_mobile                     0
is_enter

In [4]:
startups = startups.drop(['Unnamed: 6'], axis=1)

In [5]:
startups.isnull().sum()

state_code                    0
latitude                      0
longitude                     0
zip_code                      0
id                            0
city                          0
name                          0
labels                        0
founded_at                    0
closed_at                   588
first_funding_at              0
last_funding_at               0
age_first_funding_year        0
age_last_funding_year         0
age_first_milestone_year    152
age_last_milestone_year     152
relationships                 0
funding_rounds                0
funding_total_usd             0
milestones                    0
state_code.1                  1
is_CA                         0
is_NY                         0
is_MA                         0
is_TX                         0
is_otherstate                 0
category_code                 0
is_software                   0
is_web                        0
is_mobile                     0
is_enterprise                 0
is_adver

In [6]:
print(startups['state_code.1']==startups['state_code'])
print(startups['id']==startups['object_id'])

Unnamed: 0
1005    True
204     True
1001    True
738     True
1002    True
        ... 
352     True
721     True
557     True
589     True
462     True
Length: 923, dtype: bool
Unnamed: 0
1005    True
204     True
1001    True
738     True
1002    True
        ... 
352     True
721     True
557     True
589     True
462     True
Length: 923, dtype: bool


In [7]:
startups = startups.drop(['state_code.1'], axis=1)
startups = startups.drop(['object_id'], axis=1)
start = ['c:']
end = ['']
startups['id'] = startups['id'].replace(start, end, regex=True)
startups.avg_participants = startups.avg_participants.round(4)

In [8]:
# startups['closed_at'] = startups['closed_at'].fillna('not closed')
startups['age_first_milestone_year'] = startups['age_first_milestone_year'].fillna(0)
startups['age_last_milestone_year'] = startups['age_last_milestone_year'].fillna(0)
startups.head()

Unnamed: 0_level_0,state_code,latitude,longitude,zip_code,id,city,name,labels,founded_at,closed_at,...,is_othercategory,has_VC,has_angel,has_roundA,has_roundB,has_roundC,has_roundD,avg_participants,is_top500,status
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1005,CA,42.35888,-71.05682,92101,6669,San Diego,Bandsintown,1,1/1/2007,,...,1,0,1,0,0,0,0,1.0,0,acquired
204,CA,37.238916,-121.973718,95032,16283,Los Gatos,TriCipher,1,1/1/2000,,...,0,1,0,0,1,1,1,4.75,1,acquired
1001,CA,32.901049,-117.192656,92121,65620,San Diego,Plixi,1,3/18/2009,,...,0,0,0,1,0,0,0,4.0,1,acquired
738,CA,37.320309,-122.05004,95014,42668,Cupertino,Solidcore Systems,1,1/1/2002,,...,0,0,0,0,1,1,1,3.3333,1,acquired
1002,CA,37.779281,-122.419236,94105,65806,San Francisco,Inhale Digital,0,8/1/2010,10/1/2012,...,0,1,1,0,0,0,0,1.0,1,closed


In [9]:
#make a new column for years of the startup/how long it has been around
startups['closed_at'] = pd.to_datetime(startups['closed_at'])
startups['founded_at'] = pd.to_datetime(startups['founded_at'])

startups["age"] = (startups["closed_at"]-startups["founded_at"])
startups["age"]=round(startups.age/np.timedelta64(1,'Y'))

In [10]:
startups.head()

Unnamed: 0_level_0,state_code,latitude,longitude,zip_code,id,city,name,labels,founded_at,closed_at,...,has_VC,has_angel,has_roundA,has_roundB,has_roundC,has_roundD,avg_participants,is_top500,status,age
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1005,CA,42.35888,-71.05682,92101,6669,San Diego,Bandsintown,1,2007-01-01,NaT,...,0,1,0,0,0,0,1.0,0,acquired,
204,CA,37.238916,-121.973718,95032,16283,Los Gatos,TriCipher,1,2000-01-01,NaT,...,1,0,0,1,1,1,4.75,1,acquired,
1001,CA,32.901049,-117.192656,92121,65620,San Diego,Plixi,1,2009-03-18,NaT,...,0,0,1,0,0,0,4.0,1,acquired,
738,CA,37.320309,-122.05004,95014,42668,Cupertino,Solidcore Systems,1,2002-01-01,NaT,...,0,0,0,1,1,1,3.3333,1,acquired,
1002,CA,37.779281,-122.419236,94105,65806,San Francisco,Inhale Digital,0,2010-08-01,2012-10-01,...,1,1,0,0,0,0,1.0,1,closed,2.0
