# Data exploration

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import urllib3

Data extracted on Thursday 28th, October 2021 at 1:03 PM from Crunchbase Research.

In [3]:
extracted_year = 2021

## Load tables

In [4]:
# Tables we will use
people_df = pd.read_csv('data/people.csv')
organizations_df = pd.read_csv('data/organizations.csv')
jobs_df = pd.read_csv('data/jobs.csv')
ipos_df = pd.read_csv('data/ipos.csv')
investors_df = pd.read_csv('data/investors.csv')
investments_df = pd.read_csv('data/investments.csv')
funding_rounds_df = pd.read_csv('data/funding_rounds.csv')
degrees_df = pd.read_csv('data/degrees.csv')
acquisitions_df = pd.read_csv('data/acquisitions.csv')
category_groups_df = pd.read_csv('data/category_groups.csv')

In [None]:
# Useless tables
organization_descriptions_df = pd.read_csv('data/organization_descriptions.csv')
people_descriptions_df = pd.read_csv('data/people_descriptions.csv')
check_sum_df = pd.read_csv('data/checksum.csv')
event_appearances_df = pd.read_csv('data/event_appearances.csv')
funds_df = pd.read_csv('data/funds.csv')
org_parents = pd.read_csv('data/org_parents.csv')
events = pd.read_csv('data/events.csv')
investment_partners_df = pd.read_csv('data/investment_partners.csv')

## Preliminary table selection

The goal of this step is to figure which tables contain meaningful data for our analysis. The tables that do not make the cut will not be used further.

### People-related tables

In [None]:
# print(people_df.columns.values)
# people_df.head()

The table <code>people_df</code> contains potentially good information like rank, region, city, etc. Furthermore, we can join this table with some other ones like <code>organizations_df</code> or <code>investors_df</code>. We will keep using this so far.

In [None]:
#print(people_descriptions_df.columns.values)
#people_descriptions_df.head()

The table <code>people_descriptions_df</code> mostly contains text description of what each company does. While this might be useful for a model that leverages NLP, that is out of the scope of our project

In [None]:
# print(ipos_df.columns.values)
#ipos_df.head()

The table <code>jobs_df</code> contains useful information regarding the people that work on the company. We can leverage this table to find information regarding the founders.

In [None]:
# print(degrees_df.columns.values)
# degrees_df.head()

The table <code>degrees_df</code> contains information regarding the degrees of people. We can use this information to see where founders and early employees went to school.

### Organization-related tables

In [None]:
# print(organizations_df.columns.values)
# organizations_df.head()

The table <code>organizations_df</code> contains potentially good information like rank, status, category_list, total_funding etc. Furthermore, we can join this table with some other ones like <code>people_df</code> or <code>investors_df</code>. We will keep using this so far.

In [None]:
# print(organization_descriptions_df.columns.values)
# organization_descriptions_df.head()

The table <code>organization_descriptions_df</code> mostly contains text description of what each company does. While this might be useful for a model that leverages NLP, that is out of the scope of our project.

In [None]:
# print(org_parents_df.columns.values)
# org_parents_df.head()

The table <code>org_parents_df</code> mostly contains information about parent and child companies. This information is not relevant for our analysis.

In [None]:
# print(jobs_df.columns.values)
# jobs_df.head()

The table <code>ipos_df</code> is an extremely important one as it contains useful information about companies that exited through an IPO.

In [None]:
# print(acquisitions_df.columns.values)
# acquisitions_df.head()

We do not need the table <code>acquisitions_df</code> as <code>organizations_df</code> already tells us whether a company has been acquired.

### Investment/funding-related tables

In [None]:
# print(investors_df.columns.values)
# investors_df.head()

The table <code>investors_df</code> contains information regarding investors including domain, total money invested, investment count, etc. We will use it in our analysis.

In [None]:
# print(investments_df.columns.values)
# investments_df.head()

The table <code>investments_df</code> contains information regarding particular investments including funding round, investor name, etc. This table might come in handy later.

In [None]:
# print(funding_rounds_df.columns.values)
# funding_rounds_df.head()

The table <code>funding_rounds_df</code> contains information regarding particular founding rounds including investor name, investor count, year, raised amount, etc. This table might come in handy later.

In [None]:
# print(funds_df.columns.values)
# funds_df.head()

The table <code>funds_df</code> contains information regarding investment funds including domain, total money raised, investment count, etc. This information is not relevant for our analysis as it does not shed any light on investment decisions.

In [None]:
# print(investment_partners_df.columns.values)
# investment_partners_df.head()

The table <code>investment_partners</code> contains information regarding the partners that led each investment. While this information might be really interesting, we are not looking for this level of granularity in our analysis. Because of that, we will not use this table moving forward.

### Events-related tables

The following tables include information about different entrepreneurship/tech events awnd which companies attended. We beleive

In [None]:
# print(event_appearances_df.columns.values)
# event_appearances_df.head()

In [None]:
# print(events_df.columns.values)
# events.head()

## Preliminary data cleaning

### Some helper functions

In [12]:
# Define a function that checks if an entry is positions
def check_position(entry, positions,s=' '):
    ans = [word.lower() in positions for word in str(entry).split(s)]
    return min(1, sum(ans))


# Define a function that checks if a company was founded within a range
def founded_on(entry, years):
    year = int(entry[0:4])
    return years[0] <= year and year <= years[1]
    
# Define a function that checks if a company url is still active
http = urllib3.PoolManager()
def still_active(url):
    
    print('ga')
    try:
        r = http.request('GET', url)
        
        print(r.status)
        return r.status == 200
    except:
        return False

### People_df

In [15]:
people_df = people_df.drop(columns=['permalink','cb_url','created_at','updated_at',
                                    'state_code','region','city','facebook_url',
                                    'linkedin_url','twitter_url','logo_url'])

"\npeople_df = people_df.drop(columns=['permalink','cb_url','created_at','updated_at',\n                                    'state_code','region','city','facebook_url',\n                                    'linkedin_url','twitter_url','logo_url'])\n"

### Organizations_df

In [17]:
organizations_df = organizations_df.drop(columns=['permalink','cb_url','created_at','updated_at',
                                                  'legal_name','phone','short_description','email',
                                                  'phone','facebook_url','linkedin_url','twitter_url',
                                                  'logo_url','alias1','alias2','alias3','primary_role',
                                                  'num_exits','rank','total_funding','total_funding_currency_code']) 

"\norganizations_df = organizations_df.drop(columns=['permalink','cb_url','created_at','updated_at',\n                                                  'legal_name','phone','short_description','email',\n                                                  'phone','facebook_url','linkedin_url','twitter_url',\n                                                  'logo_url','alias1','alias2','alias3','primary_role',\n                                                  'num_exits','rank','total_funding','total_funding_currency_code'])\n                                                  "

In [18]:
# Do this so we can work efficiently without killing the kernel every 5 min
organizations_df = organizations_df.iloc[:10]

In [19]:
organizations_df

Unnamed: 0,uuid,name,type,roles,domain,homepage_url,country_code,state_code,region,city,...,postal_code,status,category_list,category_groups_list,num_funding_rounds,total_funding_usd,founded_on,last_funding_on,closed_on,employee_count
0,e1393508-30ea-8a36-3f96-dd3226033abd,Wetpaint,organization,company,wetpaint.com,http://www.wetpaint.com/,USA,NY,New York,New York,...,10010.0,acquired,"Publishing,Social Media,Social Media Management","Content and Publishing,Internet Services,Media...",3.0,39750000.0,2005-06-01,2008-05-19,,51-100
1,bf4d7b0e-b34d-2fd8-d292-6049c4f7efc7,Zoho,organization,"investor,company",zoho.com,https://www.zoho.com/,USA,CA,California,Pleasanton,...,94588.0,operating,"Cloud Computing,Collaboration,CRM,Developer To...","Administrative Services,Information Technology...",,,1996-09-15,,,1001-5000
2,5f2b40b8-d1b3-d323-d81a-b7a8e89553d0,Digg,organization,company,digg.com,http://www.digg.com,USA,NY,New York,New York,...,,acquired,"Internet,Social Media,Social Network","Internet Services,Media and Entertainment",6.0,49000000.0,2004-10-11,2016-09-13,,51-100
4,df662812-7f97-0b43-9d3e-12f64f504fbb,Facebook,organization,"investor,company",facebook.com,http://www.facebook.com,USA,CA,California,Menlo Park,...,94025.0,ipo,"Mobile Apps,Photo Sharing,Social Media,Social ...","Apps,Content and Publishing,Internet Services,...",16.0,16122820000.0,2004-02-04,2014-10-20,,10000+
8,60485007-8856-bbac-aa1b-c535c41f5f47,Omnidrive,organization,company,omnidrive.com,http://www.omnidrive.com,USA,CA,California,Palo Alto,...,94301.0,closed,"E-Commerce,File Sharing,Internet,Social Media,...","Commerce and Shopping,Internet Services,Media ...",1.0,800000.0,2005-11-01,2006-12-01,2008-09-15,unknown
9,4111dc8b-c0df-2d24-ed33-30cd137b3098,Geni,organization,company,geni.com,http://www.geni.com,USA,CA,California,West Hollywood,...,90069.0,acquired,"Collaboration,Communities","Community and Lifestyle,Other",3.0,15000000.0,2006-06-01,2009-01-19,,11-50


In [20]:
# Get companies founded between 1995 and 2000 in the USA
founded_95_15 = lambda x: founded_on(x,[1995,2015])

cond1 = organizations_df['country_code'] == 'USA'
cond2 = organizations_df['founded_on'].map(founded_95_15)
organizations_df = organizations_df[cond1 & cond2]

In [None]:
"""
# Create a bar plot for companies per state

to_plot_states = organizations_df.groupby(by='state_code').count().reset_index()[['state_code','uuid']]
sns.set_theme(style="whitegrid")
fig, ax = plt.subplots(1,1, figsize=(10,5))

ax = sns.barplot(x="state_code", y="uuid", data=to_plot_states)
ax.set_xticklabels(to_plot_states["state_code"],rotation=45,fontsize=10)

plt.show()
"""
print('')

In [21]:
# Only keep companies, filter out all investment funds
check_company = lambda x: check_position(x,['company'],',')

cond = organizations_df['roles'].map(check_company).astype(bool)
organizations_df = organizations_df[cond]

In [22]:
# Get which website domains are still working
organizations_df['active_homepage'] = organizations_df['homepage_url'].map(still_active).astype(bool)

ga
404
ga
200
ga
200
ga
200
ga
ga
200


### Jobs_df

In [171]:
jobs_df = jobs_df.drop(columns=['permalink','cb_url','rank','created_at','updated_at'])
jobs_df['started_on']= pd.to_datetime(jobs_df['started_on'])

KeyError: "['permalink' 'cb_url' 'rank' 'created_at' 'updated_at'] not found in axis"

In [86]:
# We drop titles with NaN
jobs_df = jobs_df[jobs_df['title'].notna()]

# Define functions to check if_founder or if_ceo
check_founder = lambda x: check_position(x, ['founder','co-founder'])
check_ceo = lambda x: check_position(x, ['ceo'])

# Create new columns for is_founder and is_ceo
jobs_df['is_founder'] = jobs_df['title'].map(check_founder).astype(bool)
jobs_df['is_current_ceo'] = jobs_df['title'].map(check_ceo) & jobs_df['is_current']

# Drop all but founders and current CEO
cond = jobs_df['is_founder']  | jobs_df['is_current_ceo']
jobs_df = jobs_df[cond.astype(bool)]

# We want to calculate how many are previously founders
founders_df = jobs_df[jobs_df['is_founder'] == 1]
serial_founders_df = founders_df.groupby(by=['person_uuid'])\
                                .agg({"org_uuid": pd.Series.nunique, 'started_on':'min'})\
                                .reset_index()\
                                .rename(columns={'org_uuid':'number_founded', 'started_on':'first_venture_on'})

# Merge tables
jobs2_df = jobs_df.merge(serial_founders_df,how='left',on='person_uuid')

# We determine whether a founder was a veteran founder at the time each venture was founded

cond1 = jobs2_df['is_founder'] == True
cond2 = jobs2_df['number_founded'] > 1
cond3 = jobs2_df['first_on'] < jobs2_df['started_on']

jobs2_df['veteran_founder'] = cond1 & cond2 & cond3

In [167]:
jobs2_df

Unnamed: 0,person_uuid,org_uuid,started_on,is_current,is_founder,is_current_ceo,number_founded,first_on,first_time_founder,veteran_founder
0,ed13cd36-fe2b-3707-197b-0c2d56e37a71,e1393508-30ea-8a36-3f96-dd3226033abd,2005-10-01,False,True,False,3.0,1999-01-01,True,True
1,a01b8d46-d311-3333-7c34-aa3ae9c03f22,df662812-7f97-0b43-9d3e-12f64f504fbb,2004-01-01,True,True,True,7.0,2004-01-01,False,False
2,084aaa07-0795-1fe8-9c46-98bbeb02cd64,df662812-7f97-0b43-9d3e-12f64f504fbb,2004-02-01,False,True,False,4.0,2004-02-01,False,False
3,5ac8203a-540a-ab6c-46ee-84463834fe72,df662812-7f97-0b43-9d3e-12f64f504fbb,2004-02-01,False,True,False,2.0,2004-02-01,False,False
4,93f8920e-789b-3f45-0620-556e5c43987e,f53cb4de-236e-0b1b-dee8-7104a8b018f9,2003-07-01,True,True,True,2.0,2003-07-01,False,False
...,...,...,...,...,...,...,...,...,...,...
480331,0f08f4de-0a5b-4cc3-95fc-c9838027e8b4,669be14f-ca0d-4448-94db-b5c89df5279a,2018-04-10,False,True,False,1.0,2018-04-10,False,False
480332,e42d508b-9adc-a4da-a080-db3f82d93170,9357d833-5bbc-4224-9a42-3d81c36f6476,2019-05-28,True,False,True,2.0,2016-07-01,False,False
480333,f90264dd-e9db-d759-d12c-2c2309c04a1a,a86f9d4f-3ab8-4b9a-807e-aea7de4a121d,2016-09-01,True,True,False,2.0,2011-09-01,True,True
480334,86817e2d-068e-4317-a68e-44f926ddbea6,a8b940d2-89f3-46f9-80ed-69cf4ceea3ac,2020-11-01,True,True,False,2.0,2020-11-01,False,False


In [159]:
# Get rid of unnecessary columns
# jobs2_df = jobs2_df.drop(columns=['uuid','name','type','person_name','org_name','ended_on','title','job_type'])

In [168]:
jobs2_df[jobs2_df['person_uuid'] == 'a01b8d46-d311-3333-7c34-aa3ae9c03f22']

Unnamed: 0,person_uuid,org_uuid,started_on,is_current,is_founder,is_current_ceo,number_founded,first_on,first_time_founder,veteran_founder
1,a01b8d46-d311-3333-7c34-aa3ae9c03f22,df662812-7f97-0b43-9d3e-12f64f504fbb,2004-01-01,True,True,True,7.0,2004-01-01,False,False
62220,a01b8d46-d311-3333-7c34-aa3ae9c03f22,94a457f0-16b9-87a6-970a-735c7f4ddd4e,NaT,True,True,False,7.0,2004-01-01,False,False
83714,a01b8d46-d311-3333-7c34-aa3ae9c03f22,a43794f6-e527-7a20-ea52-ed8025e1244d,NaT,True,True,False,7.0,2004-01-01,False,False
139751,a01b8d46-d311-3333-7c34-aa3ae9c03f22,b3b721ad-502e-a7be-c762-01b4bc172dec,2013-08-01,True,True,False,7.0,2004-01-01,True,True
144427,a01b8d46-d311-3333-7c34-aa3ae9c03f22,308a0538-71fd-c754-3c9d-cc536d613e2a,2015-11-01,True,True,False,7.0,2004-01-01,True,True
164733,a01b8d46-d311-3333-7c34-aa3ae9c03f22,959cb606-3a54-bf70-a50e-8049ebb781be,2009-01-01,True,True,False,7.0,2004-01-01,True,True
266915,a01b8d46-d311-3333-7c34-aa3ae9c03f22,3035d4e6-2588-6e5b-52d3-09f09a026fc9,NaT,False,True,False,7.0,2004-01-01,False,False


### Investors_df

In [35]:
investors_df = investors_df.drop(columns=['permalink','cb_url','rank','created_at',
                                          'updated_at','domain','country_code','state_code','region','city',
                                          'founded_on','closed_on','facebook_url',
                                          'linkedin_url','twitter_url','logo_url',
                                          'total_funding_usd', 'total_funding', 'total_funding_currency_code'])

"\ninvestors_df = investors_df.drop(columns=['permalink','cb_url','rank','created_at',\n                                          'updated_at','domain','country_code','state_code','region','city',\n                                          'founded_on','closed_on','facebook_url',\n                                          'linkedin_url','twitter_url','logo_url',\n                                          'total_funding_usd', 'total_funding', 'total_funding_currency_code'])\n"

In [28]:
investors_df = investors_df.iloc[:100]

In [29]:
investors_df

Unnamed: 0,uuid,name,type,roles,investor_types,investment_count,total_funding_usd,total_funding,total_funding_currency_code
0,ed13cd36-fe2b-3707-197b-0c2d56e37a71,Ben Elowitz,person,investor,angel,1.0,,,
1,bf4d7b0e-b34d-2fd8-d292-6049c4f7efc7,Zoho,organization,"investor,company",,6.0,,,
2,f4d5ab44-058b-298b-ea81-380e6e9a8eec,Omidyar Network,organization,investor,family_investment_office,327.0,,,
3,df662812-7f97-0b43-9d3e-12f64f504fbb,Facebook,organization,"investor,company",,38.0,1.612282e+10,1.612282e+10,USD
4,a01b8d46-d311-3333-7c34-aa3ae9c03f22,Mark Zuckerberg,person,investor,"investment_partner,angel",8.0,,,
...,...,...,...,...,...,...,...,...,...
95,8aaa422d-b364-06de-2952-4b302005b4e5,Michael Seibel,person,investor,"investment_partner,angel",31.0,,,
96,bec07f9a-33de-746e-e380-cadbb4be3a29,Emmett Shear,person,investor,angel,20.0,,,
97,e1274cad-2b94-0ce0-e827-a6d736bbad95,Kyle Vogt,person,investor,angel,26.0,,,
98,4ec1b17e-9816-3bbb-36e6-c282e3a029fe,Brad Hunstable,person,investor,angel,1.0,,,


### Investments_df

In [30]:
investments_df = investments_df.drop(columns=['permalink','cb_url','rank','created_at',
                                              'updated_at'])

In [33]:
investments_df.columns

Index(['uuid', 'name', 'type', 'funding_round_uuid', 'funding_round_name',
       'investor_uuid', 'investor_name', 'investor_type', 'is_lead_investor'],
      dtype='object')

In [31]:
investments_df = investments_df.iloc[:100]

In [32]:
investments_df

Unnamed: 0,uuid,name,type,funding_round_uuid,funding_round_name,investor_uuid,investor_name,investor_type,is_lead_investor
0,524986f0-3049-54a4-fa72-f60897a5e61d,Accel investment in Series A - Facebook,investment,d950d7a5-79ff-fb93-ca87-13386b0e2feb,Series A - Facebook,b08efc27-da40-505a-6f9d-c9e14247bf36,Accel,organization,True
1,6556ab92-6465-25aa-1ffc-7f8b4b09a476,Greylock investment in Series B - Facebook,investment,6fae3958-a001-27c0-fb7e-666266aedd78,Series B - Facebook,e2006571-6b7a-e477-002a-f7014f48a7e3,Greylock,organization,True
2,0216e06a-61f8-9cf1-19ba-20811229c53e,Meritech Capital Partners investment in Series...,investment,6fae3958-a001-27c0-fb7e-666266aedd78,Series B - Facebook,8d5c7e48-82da-3025-dd46-346a31bab86f,Meritech Capital Partners,organization,True
3,dadd7d86-520d-5e35-3033-fc1d8792ab91,Trinity Ventures investment in Series B - Phot...,investment,bcd5a63d-ed99-6963-0dd2-e36f6582f846,Series B - Photobucket,7ca12f7a-2f8e-48b4-a8d1-1a33a0e275b9,Trinity Ventures,organization,
4,581c4b38-9653-7117-9bd4-7ffe5c7eba69,Founders Fund investment in Series A - Geni,investment,60e6afd9-1215-465a-dd17-0ed600d4e29b,Series A - Geni,fb2f8884-ec07-895a-48d7-d9a9d4d7175c,Founders Fund,organization,True
...,...,...,...,...,...,...,...,...,...
95,eed18456-ea2a-bcd8-9bfb-4758018e3d37,ZG Ventures investment in Series B - iSkoot,investment,30caabd3-df0a-2123-2475-1d8e4b3047fd,Series B - iSkoot,a0a1490e-9ad3-6f89-ab4a-92fe8fd24022,ZG Ventures,organization,False
96,95171e50-1395-f481-86a0-1fd2e59c51f6,Khosla Ventures investment in Series B - iSkoot,investment,30caabd3-df0a-2123-2475-1d8e4b3047fd,Series B - iSkoot,fe5a4983-a46a-2fc2-5633-e35e0a86b694,Khosla Ventures,organization,False
97,825af9dd-1574-7990-81a7-81336aefa1b8,Jesselson Capital investment in Series B - iSkoot,investment,30caabd3-df0a-2123-2475-1d8e4b3047fd,Series B - iSkoot,8882f54b-0f22-0bdc-3f04-22d688445c55,Jesselson Capital,organization,False
98,b35c186f-7862-5ee8-660b-789338d84c3d,ZG Ventures investment in Series A - iSkoot,investment,b22a677e-4530-7d0a-d7b5-aed04393a2f4,Series A - iSkoot,a0a1490e-9ad3-6f89-ab4a-92fe8fd24022,ZG Ventures,organization,False


### Funding_rounds_df

In [15]:
funding_rounds_df = funding_rounds_df.drop(columns=['permalink','cb_url','rank','created_at',
                                                    'updated_at','country_code','state_code',
                                                    'region','city','announced_on','post_money_valuation',
                                                    'post_money_valuation_currency_code'])

In [36]:
funding_rounds_df = funding_rounds_df.iloc[:100]

In [37]:
funding_rounds_df

Unnamed: 0,uuid,name,type,permalink,cb_url,rank,created_at,updated_at,country_code,state_code,...,raised_amount_usd,raised_amount,raised_amount_currency_code,post_money_valuation_usd,post_money_valuation,post_money_valuation_currency_code,investor_count,org_uuid,org_name,lead_investor_uuids
0,8a945939-18e0-cc9d-27b9-bf33817b2818,Angel Round - Facebook,funding_round,facebook-angel--8a945939,https://www.crunchbase.com/funding_round/faceb...,132354.0,2007-05-27 06:08:18,2018-02-12 23:05:39,USA,CA,...,500000.0,500000.0,USD,,,,4.0,df662812-7f97-0b43-9d3e-12f64f504fbb,Facebook,3f47be49-2e32-8118-01a0-31685a4d0fd7
1,d950d7a5-79ff-fb93-ca87-13386b0e2feb,Series A - Facebook,funding_round,facebook-series-a--d950d7a5,https://www.crunchbase.com/funding_round/faceb...,199624.0,2007-05-27 06:09:10,2018-02-12 23:52:16,USA,CA,...,12700000.0,12700000.0,USD,98000000.0,98000000.0,USD,4.0,df662812-7f97-0b43-9d3e-12f64f504fbb,Facebook,b08efc27-da40-505a-6f9d-c9e14247bf36
2,6fae3958-a001-27c0-fb7e-666266aedd78,Series B - Facebook,funding_round,facebook-series-b--6fae3958,https://www.crunchbase.com/funding_round/faceb...,190559.0,2007-05-27 06:09:36,2018-02-12 23:30:46,USA,CA,...,27500000.0,27500000.0,USD,502500000.0,502500000.0,USD,5.0,df662812-7f97-0b43-9d3e-12f64f504fbb,Facebook,"e2006571-6b7a-e477-002a-f7014f48a7e3,8d5c7e48-..."
3,bcd5a63d-ed99-6963-0dd2-e36f6582f846,Series B - Photobucket,funding_round,photobucket-series-b--bcd5a63d,https://www.crunchbase.com/funding_round/photo...,269931.0,2007-05-29 11:05:59,2018-02-12 23:27:36,USA,CO,...,10500000.0,10500000.0,USD,,,,2.0,f53cb4de-236e-0b1b-dee8-7104a8b018f9,Photobucket,
4,60e6afd9-1215-465a-dd17-0ed600d4e29b,Series A - Geni,funding_round,geni-series-a--60e6afd9,https://www.crunchbase.com/funding_round/geni-...,375185.0,2007-05-31 20:19:28,2018-02-12 23:41:29,USA,CA,...,,,,10000000.0,10000000.0,USD,1.0,4111dc8b-c0df-2d24-ed33-30cd137b3098,Geni,fb2f8884-ec07-895a-48d7-d9a9d4d7175c
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,36043f36-ecaa-1f97-b11c-a61988e2d61b,Series A - Pageflakes,funding_round,pageflakes-series-a--36043f36,https://www.crunchbase.com/funding_round/pagef...,133411.0,2007-07-11 11:44:37,2018-02-12 23:33:14,USA,CA,...,1300000.0,1300000.0,USD,,,,1.0,6293e606-09d2-e1d1-2de7-7072b76cafeb,Pageflakes,d0cdfdc0-517d-ce18-4a74-194f506bccad
96,3eedda71-b681-f6a9-cf96-270964889971,Seed Round - Haute Secure,funding_round,hautesecure-seed--3eedda71,https://www.crunchbase.com/funding_round/haute...,321122.0,2007-07-11 15:58:31,2018-02-12 23:34:32,USA,WA,...,500000.0,500000.0,USD,,,,1.0,bca6051d-bfab-c24b-bc0b-9ecaabf35feb,Haute Secure,
97,fd9d16df-6e46-c84c-5a34-c52f0c5c6713,Seed Round - Netvibes,funding_round,netvibes-seed--fd9d16df,https://www.crunchbase.com/funding_round/netvi...,376599.0,2007-07-11 16:28:29,2018-02-12 23:41:45,USA,CA,...,1000000.0,1000000.0,USD,,,,5.0,239683e5-4cbc-e4a7-8057-d8a6e5b90e52,Netvibes,
98,4e002acc-b124-4ab0-8bbc-cfeacb90c9e7,Series A - Netvibes,funding_round,netvibes-series-a--4e002acc,https://www.crunchbase.com/funding_round/netvi...,207855.0,2007-07-11 16:32:43,2018-02-12 23:33:39,USA,CA,...,15000000.0,15000000.0,USD,,,,2.0,239683e5-4cbc-e4a7-8057-d8a6e5b90e52,Netvibes,b08efc27-da40-505a-6f9d-c9e14247bf36


### Degrees_df

In [173]:
degrees_df = degrees_df.drop(columns=['permalink','cb_url','rank','created_at',
                                      'updated_at','institution_uuid', 'institution_name','subject'])

KeyError: "['permalink' 'cb_url' 'rank' 'created_at' 'updated_at'] not found in axis"

In [174]:
degrees_df

Unnamed: 0,uuid,name,type,person_uuid,person_name,institution_uuid,institution_name,degree_type,subject,started_on,completed_on,is_completed
0,205fdfd1-ecac-aa43-262f-219f11755f67,MS Mass Communication @ Boston University,degree,4897dba9-3141-ecc0-2c4b-c9d844e6440f,John Green,1eab62d2-15d9-0db7-930f-2aa77d4688e1,Boston University,MS,Mass Communication,,1992-01-01,True
1,1a2ac288-eb99-3318-fde5-1517bc168f51,"BA English, French @ Washington University in...",degree,4897dba9-3141-ecc0-2c4b-c9d844e6440f,John Green,6ae9957a-8fb4-0ab1-73fa-dd547c4d3da4,Washington University in St. Louis,BA,"English, French",,1990-01-01,True
2,b978d338-7ccc-7469-5ce7-ef98c34155ad,MS Internet Technology @ University of Greenwich,degree,7d187b77-94f7-e6cc-6981-d7468db5968f,Sridhar Gundaiah,b5ea73f6-12a3-576d-ae9b-f4169147f974,University of Greenwich,MS,Internet Technology,,2006-01-01,True
3,d01cfaad-0ccc-631e-0382-1e507bf6ba04,BS Computer Science @ Northeastern University,degree,45e29113-19f9-c828-68d0-42ad2ef04da0,John Furrier,5f84d9fa-78fd-14b0-d335-5ac3561cdc0a,Northeastern University,BS,Computer Science,,1988-01-01,True
4,16a607f5-8ab3-573e-8d09-57b3e352e1d3,BS Computer Science @ Rochester Institute of ...,degree,b3b4ec6b-86fe-dd6e-ec8a-e8e7e999f274,Ian Reardon,1a380593-fb9d-7f7b-ae67-784ce3ff3690,Rochester Institute of Technology,BS,Computer Science,,2002-01-01,True
...,...,...,...,...,...,...,...,...,...,...,...,...
407471,82acebc9-9e5d-4ea6-8af5-76bf1dc84751,,degree,704d3912-f733-4460-97c1-5d2dbb6e4cbd,Vladislav Zdorenko,20135206-96eb-8be0-9ac4-670b257e532c,Stanford University,,VC Unlocked: Silicon Valley,,,False
407472,49db660c-a8e5-4f57-ac20-ea5d9b420b02,,degree,704d3912-f733-4460-97c1-5d2dbb6e4cbd,Vladislav Zdorenko,cc741439-a1aa-b938-3d7e-5b287ed0381e,Bauman Moscow State Technical University,,,,,False
407473,9cc520ae-be19-44e0-9756-7990985758d2,,degree,e68f561a-5ebd-49b1-b2d3-97d96264002a,Sergey Fedyushchenko,9193480c-c873-4e5c-9e5d-05c2a7665550,"Haas School of Business, University of Califor...",,,,,False
407474,1f4faf92-4da8-4d54-8442-91169523e45f,,degree,485a2d98-5be8-4903-bbb7-f32b566e483a,Carlos Tramutola,94a74e17-c5e4-f996-b2b0-4c9d4db492db,Stanford Graduate School of Business,MBA,,2000-09-01,2002-06-30,True


### Category_groups_df

In [18]:
category_groups_df = category_groups_df.drop(columns=['permalink','cb_url','rank','created_at','updated_at'])