# Data exploration

In [281]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import urllib3

Data extracted on Thursday 28th, October 2021 at 1:03 PM from Crunchbase Research.

In [282]:
extracted_year = 2021

## Load tables

In [283]:
# Tables we will use
people_df = pd.read_csv('data/people.csv')
organizations_df = pd.read_csv('data/organizations.csv')
jobs_df = pd.read_csv('data/jobs.csv')
investors_df = pd.read_csv('data/investors.csv')
funding_rounds_df = pd.read_csv('data/funding_rounds.csv')
degrees_df = pd.read_csv('data/degrees.csv')
category_groups_df = pd.read_csv('data/category_groups.csv')

In [284]:
# Useless tables
organization_descriptions_df = pd.read_csv('data/organization_descriptions.csv')
people_descriptions_df = pd.read_csv('data/people_descriptions.csv')
check_sum_df = pd.read_csv('data/checksum.csv')
event_appearances_df = pd.read_csv('data/event_appearances.csv')
ipos_df = pd.read_csv('data/ipos.csv')
acquisitions_df = pd.read_csv('data/acquisitions.csv')
investments_df = pd.read_csv('data/investments.csv')
funds_df = pd.read_csv('data/funds.csv')
org_parents = pd.read_csv('data/org_parents.csv')
events = pd.read_csv('data/events.csv')
investment_partners_df = pd.read_csv('data/investment_partners.csv')

## Preliminary table selection

The goal of this step is to figure which tables contain meaningful data for our analysis. The tables that do not make the cut will not be used further.

### People-related tables

In [285]:
# print(people_df.columns.values)
# people_df.head()

The table <code>people_df</code> contains potentially good information like rank, region, city, etc. The information in this table is contained by other tables like <code>jobs_df</code> and <code>degrees_df</code>.

In [286]:
#print(people_descriptions_df.columns.values)
#people_descriptions_df.head()

The table <code>people_descriptions_df</code> mostly contains text description of what each company does. While this might be useful for a model that leverages NLP, that is out of the scope of our project

In [287]:
# print(ipos_df.columns.values)
#ipos_df.head()

The table <code>jobs_df</code> contains useful information regarding the people that work on the company. We can leverage this table to find information regarding the founders.

In [288]:
# print(degrees_df.columns.values)
# degrees_df.head()

The table <code>degrees_df</code> contains information regarding the degrees of people. We can use this information to see where founders and early employees went to school.

### Organization-related tables

In [289]:
# print(organizations_df.columns.values)
# organizations_df.head()

The table <code>organizations_df</code> contains potentially good information like rank, status, category_list, total_funding etc. Furthermore, we can join this table with some other ones like <code>people_df</code> or <code>investors_df</code>. We will keep using this so far.

In [290]:
# print(organization_descriptions_df.columns.values)
# organization_descriptions_df.head()

The table <code>organization_descriptions_df</code> mostly contains text description of what each company does. While this might be useful for a model that leverages NLP, that is out of the scope of our project.

In [291]:
# print(org_parents_df.columns.values)
# org_parents_df.head()

The table <code>org_parents_df</code> mostly contains information about parent and child companies. This information is not relevant for our analysis.

In [292]:
# print(jobs_df.columns.values)
# jobs_df.head()

The table <code>ipos_df</code> is an extremely important one as it contains useful information about companies that exited through an IPO.

In [293]:
# print(acquisitions_df.columns.values)
# acquisitions_df.head()

We do not need the table <code>acquisitions_df</code> as <code>organizations_df</code> already tells us whether a company has been acquired.

### Investment/funding-related tables

In [294]:
# print(investors_df.columns.values)
# investors_df.head()

The table <code>investors_df</code> contains information regarding investors including domain, total money invested, investment count, etc. We will use it in our analysis.

In [295]:
# print(investments_df.columns.values)
# investments_df.head()

The table <code>investments_df</code> contains information regarding particular investments including funding round, investor name, etc. While this information might be useful, it is already included in other tables like <code>funding_rounds_df</code> and <code>investors_df</code>

In [296]:
# print(funding_rounds_df.columns.values)
# funding_rounds_df.head()

The table <code>funding_rounds_df</code> contains information regarding particular founding rounds including investor name, investor count, year, raised amount, etc. This table might come in handy later.

In [297]:
# print(funds_df.columns.values)
# funds_df.head()

The table <code>funds_df</code> contains information regarding investment funds including domain, total money raised, investment count, etc. This information is not relevant for our analysis as it does not shed any light on investment decisions.

In [298]:
# print(investment_partners_df.columns.values)
# investment_partners_df.head()

The table <code>investment_partners</code> contains information regarding the partners that led each investment. While this information might be really interesting, we are not looking for this level of granularity in our analysis. Because of that, we will not use this table moving forward.

### Events-related tables

The following tables include information about different entrepreneurship/tech events awnd which companies attended. We beleive

In [299]:
# print(event_appearances_df.columns.values)
# event_appearances_df.head()

In [300]:
# print(events_df.columns.values)
# events.head()

## Preliminary data cleaning

### Some helper functions

In [301]:
# Define a function that checks if an entry is positions
def check_position(entry, positions,s=' '):
    ans = [word.lower() in positions for word in str(entry).split(s)]
    return min(1, sum(ans))

# Define a function that checks if a company was founded within a range
def founded_on(entry, years):
    year = int(entry[0:4])
    return years[0] <= year and year <= years[1]

### Organizations_df

In [302]:
organizations_df = organizations_df.drop(columns=['permalink','cb_url','created_at','updated_at',
                                                  'legal_name','phone','short_description','email',
                                                  'phone','facebook_url','linkedin_url','twitter_url',
                                                  'logo_url','alias1','alias2','alias3','primary_role',
                                                  'num_exits','rank','total_funding','total_funding_currency_code']) 

In [303]:
# Get organizations in the USA
cond1 = organizations_df['country_code'] == 'USA'

# Organizations_dfies founded between 1995 and 2000 in the USA

# We drop all those with nan values
organizations_df = organizations_df.dropna(subset=['founded_on'])
founded_95_15 = lambda x: founded_on(x,[1995,2015])
cond2 = organizations_df['founded_on'].map(founded_95_15)

organizations_df = organizations_df[cond1 & cond2]

# Only keep companies, filter out all investment funds
check_company = lambda x: check_position(x,['company'],',')

cond = organizations_df['roles'].map(check_company).astype(bool)
organizations_df = organizations_df[cond]

  organizations_df = organizations_df[cond1 & cond2]


### Jobs_df

In [304]:
jobs_df = jobs_df.drop(columns=['permalink','cb_url','rank','created_at','updated_at'])
jobs_df['started_on']= pd.to_datetime(jobs_df['started_on'],errors = 'coerce')

In [305]:
# We drop titles with NaN
jobs_df = jobs_df[jobs_df['title'].notna()]

# Define functions to check if_founder or if_ceo
check_founder = lambda x: check_position(x, ['founder','co-founder'])
check_ceo = lambda x: check_position(x, ['ceo'])

# Create new columns for is_founder and is_ceo
jobs_df['is_founder'] = jobs_df['title'].map(check_founder).astype(bool)
jobs_df['is_current_ceo'] = jobs_df['title'].map(check_ceo) & jobs_df['is_current']

# Drop all but founders and current CEO
cond = jobs_df['is_founder']  | jobs_df['is_current_ceo']
jobs_df = jobs_df[cond.astype(bool)]

# We want to calculate how many are previously founders
founders_df = jobs_df[jobs_df['is_founder'] == 1]
serial_founders_df = founders_df.groupby(by=['person_uuid'])\
                                .agg({"org_uuid": pd.Series.nunique, 'started_on':'min'})\
                                .reset_index()\
                                .rename(columns={'org_uuid':'number_founded', 'started_on':'first_venture_on'})

# Merge tables
people_df = jobs_df.merge(serial_founders_df,how='left',on='person_uuid')

# We determine whether a founder was a veteran founder at the time each venture was founded
cond1 = jobs2_df['is_founder'] == True
cond2 = jobs2_df['number_founded'] > 1
cond3 = jobs2_df['first_venture_on'] < jobs2_df['started_on']

people_df['veteran_founder'] = cond1 & cond2 & cond3

# Get rid of unnecessary columns
people_df = people_df.drop(columns=['uuid','name','type','person_name','org_name','ended_on','title','job_type'])

In [306]:
people_df

Unnamed: 0,person_uuid,org_uuid,started_on,is_current,is_founder,is_current_ceo,number_founded,first_venture_on,veteran_founder
0,ed13cd36-fe2b-3707-197b-0c2d56e37a71,e1393508-30ea-8a36-3f96-dd3226033abd,2005-10-01,False,True,False,3.0,1999-01-01,True
1,a01b8d46-d311-3333-7c34-aa3ae9c03f22,df662812-7f97-0b43-9d3e-12f64f504fbb,2004-01-01,True,True,True,7.0,2004-01-01,False
2,084aaa07-0795-1fe8-9c46-98bbeb02cd64,df662812-7f97-0b43-9d3e-12f64f504fbb,2004-02-01,False,True,False,4.0,2004-02-01,False
3,5ac8203a-540a-ab6c-46ee-84463834fe72,df662812-7f97-0b43-9d3e-12f64f504fbb,2004-02-01,False,True,False,2.0,2004-02-01,False
4,93f8920e-789b-3f45-0620-556e5c43987e,f53cb4de-236e-0b1b-dee8-7104a8b018f9,2003-07-01,True,True,True,2.0,2003-07-01,False
...,...,...,...,...,...,...,...,...,...
480331,0f08f4de-0a5b-4cc3-95fc-c9838027e8b4,669be14f-ca0d-4448-94db-b5c89df5279a,2018-04-10,False,True,False,1.0,2018-04-10,False
480332,e42d508b-9adc-a4da-a080-db3f82d93170,9357d833-5bbc-4224-9a42-3d81c36f6476,2019-05-28,True,False,True,2.0,2016-07-01,False
480333,f90264dd-e9db-d759-d12c-2c2309c04a1a,a86f9d4f-3ab8-4b9a-807e-aea7de4a121d,2016-09-01,True,True,False,2.0,2011-09-01,True
480334,86817e2d-068e-4317-a68e-44f926ddbea6,a8b940d2-89f3-46f9-80ed-69cf4ceea3ac,2020-11-01,True,True,False,2.0,2020-11-01,False


### Investors_df

In [307]:
investors_df = investors_df.drop(columns=['permalink','cb_url','rank','created_at',
                                          'updated_at','domain','country_code','state_code','region','city',
                                          'founded_on','closed_on','facebook_url',
                                          'linkedin_url','twitter_url','logo_url',
                                          'total_funding_usd','total_funding','total_funding_currency_code',
                                          'name','type','roles','investor_types'])

In [308]:
# Rename columns
investors_df = investors_df.rename(columns={'uuid':'lead_investor_uuid','investment_count':'lead_investment_count'})

In [309]:
#investors_df

### Funding_rounds_df

In [310]:
funding_rounds_df = funding_rounds_df.drop(columns=['permalink','cb_url','rank','created_at',
                                                    'updated_at','country_code','state_code',
                                                    'region','city','announced_on','post_money_valuation',
                                                    'post_money_valuation_currency_code',
                                                    'raised_amount','raised_amount_currency_code','post_money_valuation_usd',
                                                    'name','org_name','type'])

In [311]:
# Exclude all rounds taking place after Series B
rounds = ['angel', 'seed', 'series_a', 'series_b']
check_rounds = lambda x: x in rounds

cond = funding_rounds_df['investment_type'].map(check_rounds)
funding_rounds_df = funding_rounds_df[cond]

# We create an indicator variable to see if an investment round had a lead_investor
# True means the round had a lead investor
funding_rounds_df['lead_investor_ind'] = ~(funding_rounds_df['lead_investor_uuids'].isna())

# Rename columns
funding_rounds_df = funding_rounds_df.rename(columns={'uuid':'rounds_uuid','lead_investor_uuids':'lead_investor_uuid'})

# Convert investment_type into dummies
funding_rounds_df = funding_rounds_df = pd.get_dummies(funding_rounds_df,columns=['investment_type'])

In [312]:
#funding_rounds_df

### Degrees_df

In [313]:
degrees_df = degrees_df.drop(columns=['permalink','cb_url','rank','created_at',
                                      'updated_at','institution_uuid','institution_name','subject',
                                      'name','person_name','institution_uuid','institution_name',
                                      'degree_type','subject','started_on'])

In [314]:
# Convert our dates into date_data
# Convert invalid dates to NaT
degrees_df['completed_on'] = pd.to_datetime(degrees_df['completed_on'],errors = 'coerce')

# Find whether a person has completed a degree, how many degrees they have attempted, and the earliest completion date
degrees_df = degrees_df.groupby(by=['person_uuid']).agg({'uuid':'count', 'is_completed':'sum', 'completed_on':'min'}).reset_index()
degrees_df = degrees_df.rename(columns={'uuid':'num_degs_attempted', 'is_completed':'num_degs_finished', 'completed_on':'first_deg_completed_date'})

In [315]:
degrees_df

Unnamed: 0,person_uuid,num_degs_attempted,num_degs_finished,first_deg_completed_date
0,000020dc-18ce-7f7b-e8c4-8f5d716ad09d,3,1,2012-01-01
1,00002852-4f2a-473a-ae63-810fa8d3f31f,1,1,2017-01-01
2,00006aa5-68cc-7430-eb3d-9bd8305dcb4d,2,0,NaT
3,0000a1a4-5804-cf06-d23e-cecc8017b220,2,0,NaT
4,0000bf3d-7645-e2ef-ec7b-0408a9b2f879,2,0,NaT
...,...,...,...,...
298766,fffedd65-2f76-4175-80db-e516c8357794,1,0,NaT
298767,fffee386-822f-4f78-a61d-6ee443ee0c15,1,1,1989-01-01
298768,fffefc19-5a64-6535-17c2-3155be76a204,1,0,NaT
298769,ffffd444-7490-e683-5277-aef4c3461614,3,0,NaT


### Category_groups_df

In [316]:
category_groups_df = category_groups_df.drop(columns=['permalink','cb_url','rank','created_at','updated_at'])

In [317]:
category_groups_df

Unnamed: 0,uuid,name,type,category_groups_list
0,f9b14a15-5517-8f38-0562-729ebb54dfdb,Homeless Shelter,category,Other
1,f8320fcf-b657-37d2-1495-daa3ad888ece,Freemium,category,Other
2,f0193199-a968-b457-eb49-95344e22a5ce,Industrial,category,Manufacturing
3,ed8217ff-bd11-26fe-d82e-eb98a276dc2e,Innovation Management,category,Professional Services
4,e9794581-9547-2150-8185-7b747f5c9913,Lighting,category,Hardware
...,...,...,...,...
739,68413bd1-e42d-4cf1-815b-62e64a6b922a,Quantum Computing,category,Science and Engineering
740,3e0e2772-75e9-4f7b-aa6d-a8dc3dc9e7c6,Marine Technology,category,Science and Engineering
741,ce11fd97-65a0-4f8c-b385-2ca48f444479,Smart Cities,category,Real Estate
742,b561fa84-cb20-4a82-a6bc-fea613d74cdd,Last Mile Transportation,category,Transportation


### Merging to obtain final dataset

In [318]:
organizations_df

Unnamed: 0,uuid,name,type,roles,domain,homepage_url,country_code,state_code,region,city,...,postal_code,status,category_list,category_groups_list,num_funding_rounds,total_funding_usd,founded_on,last_funding_on,closed_on,employee_count
0,e1393508-30ea-8a36-3f96-dd3226033abd,Wetpaint,organization,company,wetpaint.com,http://www.wetpaint.com/,USA,NY,New York,New York,...,10010,acquired,"Publishing,Social Media,Social Media Management","Content and Publishing,Internet Services,Media...",3.0,3.975000e+07,2005-06-01,2008-05-19,,51-100
1,bf4d7b0e-b34d-2fd8-d292-6049c4f7efc7,Zoho,organization,"investor,company",zoho.com,https://www.zoho.com/,USA,CA,California,Pleasanton,...,94588,operating,"Cloud Computing,Collaboration,CRM,Developer To...","Administrative Services,Information Technology...",,,1996-09-15,,,1001-5000
2,5f2b40b8-d1b3-d323-d81a-b7a8e89553d0,Digg,organization,company,digg.com,http://www.digg.com,USA,NY,New York,New York,...,,acquired,"Internet,Social Media,Social Network","Internet Services,Media and Entertainment",6.0,4.900000e+07,2004-10-11,2016-09-13,,51-100
4,df662812-7f97-0b43-9d3e-12f64f504fbb,Facebook,organization,"investor,company",facebook.com,http://www.facebook.com,USA,CA,California,Menlo Park,...,94025,ipo,"Mobile Apps,Photo Sharing,Social Media,Social ...","Apps,Content and Publishing,Internet Services,...",16.0,1.612282e+10,2004-02-04,2014-10-20,,10000+
8,60485007-8856-bbac-aa1b-c535c41f5f47,Omnidrive,organization,company,omnidrive.com,http://www.omnidrive.com,USA,CA,California,Palo Alto,...,94301,closed,"E-Commerce,File Sharing,Internet,Social Media,...","Commerce and Shopping,Internet Services,Media ...",1.0,8.000000e+05,2005-11-01,2006-12-01,2008-09-15,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1642150,9e8f537a-ae5d-4b53-a99a-c794736de0e9,Dearmitt Financial Group LLC,organization,company,dearmittfg.com,https://www.dearmittfg.com/,USA,IN,Indiana,Franklin,...,46131,operating,"Advice,Consulting,Financial Services","Financial Services,Media and Entertainment,Pro...",,,2006-09-01,,,11-50
1642153,587a5a70-09b1-4ceb-a559-a131eee78dc0,Adler Real Estate Partners,organization,company,adler-partners.com,https://adler-partners.com/,USA,FL,Florida,Miami,...,33131,operating,"Commercial,Commercial Real Estate,Real Estate","Other,Real Estate",,,2012-01-01,,,11-50
1642154,118af019-6af0-46d1-8774-67ccd757b3c2,Arctic Fox,organization,company,arcticfoxhaircolor.com,https://arcticfoxhaircolor.com/,USA,CA,California,Cerritos,...,90703,operating,"Beauty,Commercial,Cosmetics,E-Commerce","Commerce and Shopping,Consumer Goods,Other",,,2014-01-01,,,11-50
1642156,8033c823-373f-4087-bb1c-6cfa6adbe821,Creative Commercial Group,organization,company,creativecommercialgroup.com,http://www.creativecommercialgroup.com/,USA,CA,California,San Rafael,...,94901,operating,"Commercial,Commercial Real Estate,Finance,Fina...","Financial Services,Other,Real Estate",,,2013-01-01,,,11-50


In [319]:
# Merge tables to fully include degree and job information
full_people_df = people_df.merge(degrees_df,how='inner',on='person_uuid')

In [320]:
# Merge tables to fully include lead investor information
rounds_df = funding_rounds_df.merge(investors_df,how='left',on='lead_investor_uuid')

In [321]:
angel_df = rounds_df[rounds_df['investment_type_angel'] == 1].drop(columns=['investment_type_seed','investment_type_series_a',
                                                                                      'investment_type_series_b','rounds_uuid'])
angel_df = angel_df.rename(columns={'raised_amount_usd':'angel_raised_amount','investor_count':'angel_investor_count',
                            'lead_investor_uuid':'angel_lead_investor','lead_investor_ind':'angel_investor_ind',
                            'investment_count':'angel_investment_count','lead_investment_count':'lead_angel_count'})

seed_df = rounds_df[rounds_df['investment_type_seed'] == 1].drop(columns=['investment_type_angel','investment_type_series_a',
                                                                                      'investment_type_series_b','rounds_uuid'])
seed_df = seed_df.rename(columns={'raised_amount_usd':'seed_raised_amount','investor_count':'seed_investor_count',
                            'lead_investor_uuid':'seed_lead_investor','lead_investor_ind':'seed_investor_ind','lead_investment_count':'lead_seed_count',
                            'investment_count':'seed_investment_count'})

SeriesA_df = rounds_df[rounds_df['investment_type_series_a'] == 1].drop(columns=['investment_type_seed','investment_type_angel',
                                                                                      'investment_type_series_b','rounds_uuid'])
SeriesA_df = SeriesA_df.rename(columns={'raised_amount_usd':'series_a_raised_amount','investor_count':'series_a_investor_count',
                            'lead_investor_uuid':'series_a_lead_investor','lead_investor_ind':'series_a_investor_ind','lead_investment_count':'lead_series_a_count',
                            'investment_count':'series_a_investment_count'})

SeriesB_df = rounds_df[rounds_df['investment_type_series_b'] == 1].drop(columns=['raised_amount_usd','investor_count','org_uuid','lead_investor_uuid',
                                                                                           'lead_investor_ind','investment_type_angel','investment_type_seed',
                                                                                           'investment_type_series_a','lead_investment_count'])

In [322]:
full_rounds_df = angel_df.merge(seed_df,how='left',on='org_uuid')
full_rounds_df = full_rounds_df.merge(SeriesA_df,how='left',on='org_uuid')
full_rounds_df = full_rounds_df.merge(SeriesB_df,how='left',on='org_uuid')

KeyError: 'org_uuid'

In [None]:
full_rounds_df.to_csv('clean_data/rounds.csv')
organizations_df.to_csv('clean_data/organizations.csv')
full_people_df.to_csv('clean_data/people.csv')

In [None]:
"""
# Create a bar plot for companies per state

to_plot_states = organizations_df.groupby(by='state_code').count().reset_index()[['state_code','uuid']]
sns.set_theme(style="whitegrid")
fig, ax = plt.subplots(1,1, figsize=(10,5))

ax = sns.barplot(x="state_code", y="uuid", data=to_plot_states)
ax.set_xticklabels(to_plot_states["state_code"],rotation=45,fontsize=10)

plt.show()
"""