In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Startup Investments
Venture deals, organizations, people and exits.


# Exploratory Data Analysis

__Description__

The startup world is flourishing. There are hundreds of new companies being founded each day and venture capital has become a substantial asset class with yearly investments exceeding $100B in the US alone. Crunchbase 2013 Snapshot © 2013 dataset provides a glimpse into this exciting world.


__Content__

This diverse dataset contains information about the startup ecosystem: organizations, individuals, company news, funding rounds, acquisitions, and IPOs.
There are 11 tables that can be joined using unique IDs (schema to follow). More information about the individual data variables can be found on the Crunchbase Data website (under the API Entities Types section).


__Acknowledgements__

This Crunchbase 2013 Snapshot © 2013 dataset is fully attributed to Crunchbase.

__Inspiration__
There are multiple avenues for exploration:

1) EDA of the startup ecosystem.<br>
2) Tracking and analyzing investment trends over time.<br>
3) Clustering VC funds based on their existing investments.<br>
4) Predicting which startups will proceed to raise further rounds, get acquired, or file for an IPO.<br>
5) Mapping the network of individuals involved in the startup ecosystem.


# Details about each csv file.

ACQUISITIONS.CSV<br>
Contains information about the startups that have been bought.

DEGREE.CSV<br>
Contains education background of the individuals involved in the startup world.

FUNDING ROUNDS.CSV<br>
Contains information about startup funding rounds.

FUNDS.CSV<br>
Contains data on the venture capital funds that make investments.

INVESTMENT.CSV<br>
Contains data on the type of various different investments made by venture capitalists.

IPOs.CSV<br>
Contains data on initial public offerings.

MILESTONES.CSV<br>
Contains significant events within the startup ecosystem.

OBJECT.CSV<br>
Main file containing base information

OFFICE.CSV<br>
Contains information about startup company offices (addresses and locations).

PEOPLE.CSV<br>
Contains information of people in the startup world

RELATIONSHIP.CSV<br>
Contains relationship data that links companies to individuals and their positions.

In [None]:
# Importing libraries for plotting

import matplotlib.pyplot as plt
%matplotlib inline 

import seaborn as sns
sns.set(color_codes=True)
sns.set(style='white')

In [None]:
#Let us deal with the objects.csv file
df_objects = pd.read_csv("../input/startup-investments/objects.csv")

In [None]:
df_objects.head(5)

In [None]:
df_objects.dtypes

In [None]:
# So we have object, int and float datatypes

df_objects.describe(include='all')

* 	We have details of about 462651 funding deals in this data


In [None]:
df_objects.columns

In [None]:
df_objects['entity_type'].value_counts()

In [None]:
df_objects['status'].value_counts()

In [None]:
# objects.csv calls its key "id"; give it the name investments.csv uses
# (funded_object_id) so the two tables can later be merged on one column.
df_objects = df_objects.rename(columns={'id': 'funded_object_id'})
df_objects.head(5)

In [None]:
# Print a summary of the objects frame (column dtypes, non-null counts, memory).
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called on its own; wrapping it in print("Info", ...) appended "Info None".
df_objects.info()

# Number of rows and columns in the dataset
n_rows, n_cols = df_objects.shape
print('\n\nThe dataset contains {} rows and {} columns'.format(n_rows, n_cols))


In [None]:

# check for missing data in the  data
df_objects.isnull().sum()

In [None]:
# Percentage of missing values per column, sorted from most to least missing.
missing_value_train = (
    (df_objects.isnull().sum() / len(df_objects) * 100)
    .to_frame(name='missing%')
    .sort_values(by='missing%', ascending=False)
)
# Show only the columns that actually contain missing values.
missing_value_train.loc[missing_value_train['missing%'] > 0]

In [None]:
sns.heatmap(df_objects.isnull(),yticklabels=False,cbar=False)

In [None]:
# Drop the listed columns from df_objects (this removes whole columns, not
# null values). Each label appears exactly once; the previous list repeated
# "created_at", "updated_at" and "overview".
object_columns_to_drop = [
    "created_at", "updated_at",
    "logo_url", "logo_width", "logo_height",
    "overview", "short_description", "tag_list",
    "entity_id", "parent_id", "normalized_name",
    "twitter_username", "relationships",
    "domain", "homepage_url",
    "city", "region", "state_code",
]
df_objects = df_objects.drop(columns=object_columns_to_drop)
df_objects.info()

In [None]:
df_objects['category_code'].value_counts()

In [None]:
df_objects['country_code'].value_counts()

In [None]:
df_objects['status'].value_counts()

So these are the categories of startups.

In [None]:
df_objects['country_code'].value_counts()

In [None]:
df_objects.info()

In [None]:
investments = pd.read_csv("../input/startup-investments/investments.csv")
investments.head()

In [None]:
investments['funded_object_id'].value_counts()

In [None]:
# Attach to every investment row the object it funded, matching on the shared
# funded_object_id column (pandas' default inner join).
df = pd.merge(investments, df_objects, on='funded_object_id')
df.head(5)

In [None]:
df.info()

In [None]:
df['status'].value_counts()

In [None]:
# Remove the listed columns from the merged investments/objects frame.
merged_columns_to_drop = [
    "closed_at",
    "first_investment_at",
    "invested_companies",
    "investment_rounds",
    "created_at",
    "updated_at",
]
df = df.drop(columns=merged_columns_to_drop)

In [None]:
df.tail()

In [None]:
funding_rounds = pd.read_csv("../input/startup-investments/funding_rounds.csv")
funding_rounds.head()

In [None]:
funding_rounds.info()

In [None]:
# Remove the listed identifier, valuation, currency, source and bookkeeping
# columns from funding_rounds.
funding_round_columns_to_drop = [
    'id', 'funding_round_id', 'funding_round_code',
    'raised_amount', 'raised_currency_code',
    'pre_money_valuation_usd', 'pre_money_valuation', 'pre_money_currency_code',
    'post_money_valuation_usd', 'post_money_currency_code',
    'participants', 'is_first_round', 'is_last_round',
    'source_url', 'source_description',
    'created_by', 'updated_at', 'created_at',
]
funding_rounds = funding_rounds.drop(columns=funding_round_columns_to_drop)

In [None]:
funding_rounds.drop(['post_money_valuation'], axis='columns', inplace=True)

In [None]:
funding_rounds.head()

In [None]:
# Name the key column funded_object_id so funding_rounds can be merged with
# the investments/objects frame on a single shared column.
funding_rounds = funding_rounds.rename(columns={'object_id': 'funded_object_id'})
funding_rounds.head(5)

In [None]:
# Inner-join the investments/objects frame with the funding rounds on
# funded_object_id.
# NOTE(review): both frames can hold several rows per funded_object_id
# (presumably one per investment and one per funding round), so this join
# pairs every investment with every round of the same object and can
# multiply rows — confirm this is intended rather than a join on the
# funding-round key.
df_new = pd.merge(df, funding_rounds, on='funded_object_id')
df_new.head(5)

In [None]:
df_new.info()

In [None]:
# Length of new dataframe

len(df_new)

In [None]:
# Checking for null values in  new dataframe created

df_new.isnull().sum()

In [None]:
# Share of missing values per column, as a percentage rounded to two decimals.
null_counts = df_new.isnull().sum()
(null_counts / len(df_new) * 100).round(2)

In [None]:
df_new.drop(['created_by', 'first_milestone_at', 'last_milestone_at', 'last_investment_at'], axis='columns', inplace=True)

In [None]:
df_new.head()

In [None]:
df_new['category_code'].value_counts()

In [None]:
# Check in percentage the missing data
# summing up the missing values (column-wise) and displaying fraction of NaNs
round(100*(df_new.isnull().sum()/len(df_new.index)), 2)

In [None]:
# Keep only the rows in which every one of these columns is populated.
required_columns = ['country_code', 'description', 'funded_at', 'founded_at']
df_new = df_new.dropna(subset=required_columns)

In [None]:
df_new.isna().sum()

In [None]:
# Discard rows that have no category_code.

df_new = df_new.dropna(subset=['category_code'])

In [None]:
df_new.isna().sum()

In [None]:
df_new['status'].value_counts()

In [None]:
# Count fully duplicated rows. duplicated() flags every repeat of an earlier
# row; Series.sum() counts the True flags in vectorized code instead of
# iterating the whole Series with the builtin sum().
dupes = df_new.duplicated()
dupes.sum()

In [None]:
df_new =df_new.drop_duplicates()

In [None]:
#Identify duplicates records in the data
dupes2=df_new.duplicated()
sum(dupes2)

In [None]:
# EDA of acquisitions.csv

acquisitions = pd.read_csv("../input/startup-investments/acquisitions.csv")
acquisitions.head()

In [None]:
acquisitions.info()

In [None]:
acquisitions['acquiring_object_id'].value_counts()

In [None]:
len(acquisitions)

In [None]:
acquisitions.isnull().sum()

In [None]:
# Check in percentage the missing data
# summing up the missing values (column-wise) and displaying fraction of NaNs
round(100*(acquisitions.isnull().sum()/len(acquisitions.index)), 2)

In [None]:
acquisitions.drop(['term_code', 'source_url', 'source_description'], axis='columns', inplace=True)

In [None]:
acquisitions.head()

In [None]:
acquisitions.isnull().sum()

In [None]:
# Keep only acquisitions where all of these columns are populated.
# The previous boolean expression tested price_currency_code twice; each
# column is now listed once. The set of rows kept is unchanged.
required_acquisition_columns = ['acquired_at', 'price_currency_code', 'acquired_object_id']
acquisitions = acquisitions.dropna(subset=required_acquisition_columns)

In [None]:
acquisitions.isnull().sum()

So now we don't have any null values.

In [None]:
#Identify duplicates records in the data
dupes=acquisitions.duplicated()
sum(dupes)

So we have no duplicate values.

In [None]:
# EDA of funds.csv

funds =  pd.read_csv("../input/startup-investments/funds.csv")
funds.head()

In [None]:
funds.head()

In [None]:
funds.info()

In [None]:
len(funds)

In [None]:
funds.isnull().sum()

In [None]:
# Check in percentage the missing data
# summing up the missing values (column-wise) and displaying fraction of NaNs
round(100*(funds.isnull().sum()/len(funds.index)), 2)

In [None]:
funds.drop(['source_description', 'source_url', 'funded_at'], axis='columns', inplace=True)

In [None]:
funds.head()

In [None]:
# Check in percentage the missing data
# summing up the missing values (column-wise) and displaying fraction of NaNs
round(100*(funds.isnull().sum()/len(funds.index)), 2)

So now we don't have any null values.

In [None]:
#Identify duplicates records in the data
dupes=funds.duplicated()
sum(dupes)

So we don't have any duplicate values.

In [None]:
# Funds analysis: bar chart of the amount raised by the first four rows of funds.
first_funds = funds.head(4)

fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])  # axes fill the whole figure
ax.bar(first_funds['name'], first_funds['raised_amount'])
ax.set_xlabel('Funds')
ax.set_ylabel('Amount Raised')
plt.show()

In [None]:
#EDA of Degree.csv

degrees =  pd.read_csv("../input/startup-investments/degrees.csv")
degrees.head()

This Data gives an idea about the Qualification of the People who gather the courage to start a startup.

In [None]:
degrees.info()

In [None]:
degrees.tail()

In [None]:
degrees.describe(include='all')

In [None]:
# So we have 109610.00000 values in id

In [None]:
degrees.isnull().sum()

So we have a lot of Null values.

In [None]:
len(degrees)

In [None]:
# Check in percentage the missing data
# summing up the missing values (column-wise) and displaying fraction of NaNs
round(100*(degrees.isnull().sum()/len(degrees.index)), 2)

In [None]:
# Keep only degree records where all of these columns are populated.
required_degree_columns = ['degree_type', 'subject', 'institution', 'graduated_at']
degrees = degrees.dropna(subset=required_degree_columns)

In [None]:
degrees.isnull().sum()

So now we don't have any null values.

In [None]:
dupes = degrees.duplicated()
sum(dupes)

Some Analysis of the Data.

In [None]:
# Pie chart of company status; slices are ordered and plotted anti-clockwise.

plt.rcParams['figure.figsize'] = 10,10
status_counts = df_new['status'].value_counts()  # computed once, reused below
labels = status_counts.index.tolist()
sizes = status_counts.tolist()
# Offset the second slice, as before, but size the tuple from the data:
# ax.pie requires explode to have exactly one entry per slice, so a fixed
# 4-tuple fails whenever the number of distinct statuses is not 4.
explode = tuple(0.2 if position == 1 else 0 for position in range(len(sizes)))

fig1, ax1 = plt.subplots()
ax1.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%',shadow=True, startangle=90)
ax1.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.title("What is start up companies current status", fontdict=None, position= [0.48,1], size = 'x-large')

plt.show()

Most companies in this dataset (78.5 %) are operating,

and around 3.0 % of the companies have already closed.

In [None]:
len(df_new['category_code'].unique())

In [None]:
df_new['category_code'].value_counts()[:5]

So these are the major categories.

In total we have around 43 startup categories.

In [None]:
# Bar chart of row counts per category_code (up to the 45 most frequent).
plt.rcParams['figure.figsize'] = 15,8

category_counts = df_new['category_code'].value_counts().head(45)  # computed once
height = category_counts.tolist()
bars = category_counts.index.tolist()
y_pos = np.arange(len(bars))
# Highlight only the most frequent category. The colour list is sized to the
# number of bars: a fixed 15-entry list is shorter than this chart and gets
# repeated, which re-applied the highlight colour to later bars.
bar_colors = ['c'] + ['paleturquoise'] * (len(bars) - 1)
plt.bar(y_pos, height , width=0.7 ,color=bar_colors)
plt.xticks(y_pos, bars, rotation=90)
plt.title("All Start-Up market category", fontdict=None, position= [0.48,1.05], size = 'x-large')
plt.show()

Now let us look at the top 15 startup category market.

In [None]:
# Bar chart of the 15 most frequent category codes; the first (largest) bar
# is drawn in a stronger colour than the remaining fourteen.
plt.rcParams['figure.figsize'] = 15,8

top_categories = df_new['category_code'].value_counts().head(15)
height = top_categories.tolist()
bars = top_categories.index.tolist()
y_pos = np.arange(len(bars))
highlight_palette = ['c'] + ['paleturquoise'] * 14
plt.bar(y_pos, height, width=0.7, color=highlight_palette)
plt.xticks(y_pos, bars, rotation=90)
plt.title("Top 15 Start-Up market category", fontdict=None, position= [0.48,1.05], size = 'x-large')
plt.show()

Distribution of Funding.

In [None]:
# Tukey fences for raised_amount_usd: values more than 1.5 * IQR below the
# first quartile or above the third quartile are treated as outliers.
raised_amount_usd = df_new['raised_amount_usd']
Q1 = raised_amount_usd.quantile(0.25)
Q3 = raised_amount_usd.quantile(0.75)
IQR = Q3 - Q1

fence_width = 1.5 * IQR
lower_bound = Q1 - fence_width
upper_bound = Q3 + fence_width

In [None]:
# Keep only rows whose raised_amount_usd lies strictly between the two fences.
# Both fences were derived from raised_amount_usd, so both comparisons use that
# column; the upper check previously tested funding_total_usd instead, which
# filtered on a different quantity than the one being plotted.
raised_amount = df_new['raised_amount_usd']
without_outlier = df_new[(raised_amount > lower_bound) & (raised_amount < upper_bound)]

plt.rcParams['figure.figsize'] = 15,6
plt.hist(without_outlier['raised_amount_usd'].dropna(), bins=30,color = 'paleturquoise' )

plt.ylabel('Count')
plt.xlabel('Funding (usd)')
plt.title("Distribution of total funding ", fontdict=None, position= [0.48,1.05], size = 'x-large')
plt.show()

Distribution of startups by founding year.

In [None]:
# Parse founded_at into datetimes; values that cannot be parsed become NaT
# (errors='coerce'). assign() returns a new frame instead of writing a column
# into df_new, which is the product of earlier row filtering and could
# otherwise raise SettingWithCopyWarning.
df_new = df_new.assign(founded_at=pd.to_datetime(df_new['founded_at'], errors='coerce'))

In [None]:
# Line plot of how many rows (non-null names) fall into each founding year.
founding_year = df_new['founded_at'].dt.year
founded_per_year = df_new.groupby(founding_year)['name'].count()
founded_per_year.plot(kind='line')

plt.ylabel('Count')
plt.title("Founded distribution ", fontdict=None, position= [0.48,1.05], size = 'x-large')
plt.show()


In [None]:
# EDA of ipos.csv, people.csv, milestones.csv, offices.csv, relationships.csv
#
# Load the five remaining tables. A notebook cell renders only its final
# expression, so the intermediate .head() calls that used to follow each
# read were discarded without being shown; they are removed here and only the
# last preview is kept.

ipos = pd.read_csv("../input/startup-investments/ipos.csv")
people = pd.read_csv("../input/startup-investments/people.csv")
milestones = pd.read_csv("../input/startup-investments/milestones.csv")
offices = pd.read_csv("../input/startup-investments/offices.csv")
relationships = pd.read_csv("../input/startup-investments/relationships.csv")

relationships.head()






In [None]:
ipos.head()

In [None]:
ipos.info()

In [None]:
ipos.isnull().sum()

In [None]:
# Keep only IPO records where all of these columns are populated.

required_ipo_columns = [
    'raised_currency_code',
    'object_id',
    'valuation_currency_code',
    'public_at',
    'source_url',
    'source_description',
]
ipos = ipos.dropna(subset=required_ipo_columns)

In [None]:
ipos.isnull().sum()

In [None]:
dupes = ipos.duplicated()
sum(dupes)

In [None]:
people.head()

In [None]:
people.info()

In [None]:
len(people)

In [None]:
people.isnull().sum()

In [None]:
# Keep only people records where all of these columns are populated.

required_people_columns = ['first_name', 'last_name', 'birthplace', 'affiliation_name']
people = people.dropna(subset=required_people_columns)

In [None]:
people.isnull().sum()

In [None]:
milestones.head()

In [None]:
milestones.info()

In [None]:
milestones.isnull().sum()

In [None]:
len(milestones)

In [None]:
# Keep only milestones that have both a source URL and a source description.

required_milestone_columns = ['source_url', 'source_description']
milestones = milestones.dropna(subset=required_milestone_columns)

In [None]:
milestones.isnull().sum()

In [None]:
milestones.describe()

In [None]:
offices.head()

In [None]:
offices.info()

In [None]:
offices.describe()

In [None]:
offices.isnull().sum()

In [None]:
# Pairwise correlation between the numeric columns of offices.
# The frame is restricted to numeric dtypes explicitly: on pandas >= 2.0 a
# bare DataFrame.corr() no longer skips non-numeric columns and raises instead
# (offices presumably holds text columns such as description and city).
offices.select_dtypes(include='number').corr()

In [None]:
# Keep only office records where all of these columns are populated.
# NOTE(review): requiring optional-looking fields such as address2 and
# description may discard most rows — confirm this is intended.

required_office_columns = [
    'description',
    'address1',
    'address2',
    'city',
    'zip_code',
    'state_code',
    'created_at',
    'updated_at',
]
offices = offices.dropna(subset=required_office_columns)

In [None]:
offices.isnull().sum()

In [None]:
relationships.head()

In [None]:
relationships.info()

In [None]:
relationships.describe()

In [None]:
relationships.isnull().sum()

In [None]:
len(relationships)

In [None]:
# Keep only relationships that have a start date, an end date and a title.

required_relationship_columns = ['start_at', 'end_at', 'title']
relationships = relationships.dropna(subset=required_relationship_columns)

In [None]:
relationships.isnull().sum()

In [None]:
len(relationships)

In [None]:
# Check in percentage the missing data
# summing up the missing values (column-wise) and displaying fraction of NaNs
round(100*(relationships.isnull().sum()/len(relationships.index)), 2)