In [None]:
# Kaggle Python 3 environment, defined by the kaggle/python docker image:
# https://github.com/kaggle/docker-python
# The packages used throughout this notebook are imported here.

import os  # file-system access, used below to list the input directory
import re  # regex

import matplotlib.pyplot as plt  # charting
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.feature_extraction.text import CountVectorizer  # convert text to token counts
from wordcloud import WordCloud  # word cloud

# Input data files are available in the "../input/" directory.
# List that directory so the dataset file name can be confirmed before loading.
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

# View Dataset

In [None]:
# Read the startup funding CSV from the input directory into `df`
# and display its first 5 rows
df = pd.read_csv('../input/startup_funding.csv')
df.head()

In [None]:
# Show the dtype pandas assigned to each column when the CSV was loaded
df.dtypes

In [None]:
# Use the SNo column as the row index; set_index() by column name moves the
# column into the index, so no separate drop is required
df = df.set_index('SNo')
df.head()

In [None]:
# Convert Date to datetime format

# Running
# df['Date'] = pd.to_datetime(df['Date'], infer_datetime_format=True)
# gives ValueError: ('Unknown string format:', '12/05.2015')

# Look at the values containing a literal '.'.
# regex=False makes the dot match literally, so no backslash escape is needed
# ('\.' inside a non-raw string literal is an invalid escape sequence).
df['Date'].loc[df['Date'].str.contains('.', regex=False)]

In [None]:
# Replace '.' with '/'.
# The dot is replaced as plain text (regex=False) so the result does not
# depend on the pandas version's default for the `regex` argument.
df['Date'] = df['Date'].str.replace('.', '/', regex=False)

# Check if replaced, should return nothing
df['Date'].loc[df['Date'].str.contains('.', regex=False)]

In [None]:
# Running
# df['Date'] = pd.to_datetime(df['Date'], infer_datetime_format=True)
# again gives ValueError: ('Unknown string format:', '22/01//2015')

# Look at the values containing a doubled slash '//' (matched as plain text)
df['Date'].loc[df['Date'].str.contains('//', regex=False)]

In [None]:
# Collapse each doubled slash into a single slash
double_slash = '//'
df['Date'] = df['Date'].str.replace(double_slash, '/')

# Sanity check: once replaced, no rows should be listed here
still_doubled = df['Date'].str.contains(double_slash)
df['Date'].loc[still_doubled]

In [None]:
# Convert date
# The raw strings are day-first (e.g. '22/01//2015' seen above cannot be
# month-first), so tell pandas explicitly; otherwise ambiguous values such as
# 05/06/2016 may be read as month/day.
df['Date'] = pd.to_datetime(df['Date'], dayfirst=True)
df['Date'].dtype

In [None]:
# Convert AmountInUSD from comma-separated text to float

# Strip the commas first, then cast the cleaned text to float
amount_without_commas = df['AmountInUSD'].str.replace(',', '')
df['AmountInUSD'] = amount_without_commas.astype(float)
df['AmountInUSD'].dtype

In [None]:
# Double check all data types now that Date and AmountInUSD have been converted
df.dtypes

# EDA

In [None]:
# Print the DataFrame summary produced by pandas' info()
df.info()

In [None]:
# Descriptive statistics as computed by pandas' describe()
df.describe()

In [None]:
# (number of rows, number of columns)
df.shape

## Funding by Date

- About 2 years 11 months worth of data
- Pretty consistent pattern (seasonality) - likely due to weekdays/weekends
- Overall, slightly fewer investments per day in 2017 compared to 2015 and 2016
- The cumulative number of fundings is plateauing

In [None]:
print('The funding dates are between', df['Date'].min(), 'and', df['Date'].max())

# Count the fundings per date and order them chronologically so the line
# runs left to right through time
fundings_per_day = df['Date'].value_counts().sort_index()
fundings_per_day.plot.line(color='navy')
plt.title('Number of Funding Across Time')
plt.xlabel('Date')
plt.ylabel('Number of Fundings')  # the y axis holds counts, not time
plt.show()

In [None]:
# Running total of fundings: count per date, order by date, then accumulate
cumulative_fundings = df['Date'].value_counts().sort_index().cumsum()
cumulative_fundings.plot.line(color='navy')
plt.title('Cumulative Funding Across Time')
plt.xlabel('Date')
plt.ylabel('Cumulative Funding')
plt.show()

## Funding by Startup

- Swiggy received the largest number of fundings (this does not necessarily mean the largest total amount - that requires more analysis)
- On average, each startup got 1.185 fundings - not bad actually!

In [None]:
# Startup names with the most funding rows (value_counts sorts by count, head keeps the first 5)
print('Top number of fundings')
df['StartupName'].value_counts().head()

In [None]:
# Mean number of funding rows per distinct startup name
fundings_per_startup = df['StartupName'].value_counts()
print('On average, each startup got funded', fundings_per_startup.mean(), 'times')

## Industry Vertical

- 744 industry verticals
- Technology based solutions are the majority
- Words like "consumer", "internet", "technology" and "ecommerce" are trending

In [None]:
# nunique() counts the distinct non-null IndustryVertical values
print('Number of unique Industry Verticals:', df['IndustryVertical'].nunique())

In [None]:
# The 20 most frequent industry verticals
df['IndustryVertical'].value_counts().head(20)

In [None]:
# Bar chart of the 20 most frequent industry verticals
top_industry_verticals = df['IndustryVertical'].value_counts().head(20)
top_industry_verticals.plot.bar(color='navy')
plt.title('Top 20 Industry Verticals')
plt.xlabel('Industry Vertical')
plt.ylabel('Count')
plt.show()

In [None]:
# Join every industry vertical into one text and build a word cloud from it
industry_text = df['IndustryVertical'].str.cat(sep=' ')
wordcloud = WordCloud(collocations=False)  # collocations=False disables two-word phrases (bigrams)
wordcloud.generate(industry_text)

plt.imshow(wordcloud)  # draw the cloud as an image
plt.axis('off')  # no axes around the image
plt.title('Single Word Trends')
plt.show()

In [None]:
# Same text as the single-word cloud, but with WordCloud's default settings
industry_text = df['IndustryVertical'].str.cat(sep=' ')
wordcloud = WordCloud()
wordcloud.generate(industry_text)

plt.imshow(wordcloud)  # draw the cloud as an image
plt.axis('off')  # no axes around the image
plt.title('Single and Double Words Trends')
plt.show()

## Sub-Vertical Trends

- 1385 Sub-Verticals
- Trending words include "online", "solution", "app", "marketplace", "platform", "service", "mobile", "discovery"

In [None]:
# nunique() counts the distinct non-null SubVertical values
print('Number of unique Sub Verticals:', df['SubVertical'].nunique())

In [None]:
# The 20 most frequent sub-verticals
df['SubVertical'].value_counts().head(20)

In [None]:
# Join every sub-vertical into one text and build a word cloud from it
subvertical_text = df['SubVertical'].str.cat(sep=' ')
wordcloud = WordCloud(collocations=False)  # collocations=False disables two-word phrases (bigrams)
wordcloud.generate(subvertical_text)

plt.imshow(wordcloud)  # draw the cloud as an image
plt.axis('off')  # no axes around the image
plt.title('Single Word Trends')
plt.show()

In [None]:
# Same text as the single-word cloud, but with WordCloud's default settings
subvertical_text = df['SubVertical'].str.cat(sep=' ')
wordcloud = WordCloud()
wordcloud.generate(subvertical_text)

plt.imshow(wordcloud)  # draw the cloud as an image
plt.axis('off')  # no axes around the image
plt.title('Single and Double Words Trends')
plt.show()

## City Location

- A startup can be in multiple locations, separated by slashes
- Most startups are based in Indian cities

In [None]:
# Number of distinct raw CityLocation strings (multi-city entries are still unsplit here)
df['CityLocation'].nunique()

In [None]:
# 20 most frequent raw CityLocation strings
df['CityLocation'].value_counts().head(20)

In [None]:
# First five non-null locations that contain a '/'
known_locations = df['CityLocation'].dropna()
known_locations.loc[known_locations.str.contains('/')][:5]

In [None]:
# Separate locations
# Drop nulls, split each entry on '/' and collect the trimmed names in one flat list.
# A single comprehension is used instead of Series.sum() over lists, which
# re-copies the growing list on every concatenation.
city_locations = [city.strip()
                  for location in df['CityLocation'].dropna()
                  for city in location.split('/')]
pd.Series(city_locations).value_counts()[:20]

In [None]:
# Bar chart of the 20 most frequent individual locations
pd.Series(city_locations).value_counts()[:20].plot.bar(color='navy')
plt.title('Top 20 Indian Startup Locations')
plt.xlabel('City/Location')  # the bars are cities/locations, not countries
plt.ylabel('Count')
plt.show()

## Investors Name

- Investors' names are separated by commas
- Investment companies make more investments than individuals

In [None]:
# First 20 distinct raw InvestorsName strings
df['InvestorsName'].unique()[:20]

In [None]:
# Separate investor names
# Drop nulls, split each entry on ',' and collect the trimmed, title-cased
# names in one flat list. A single comprehension is used instead of
# Series.sum() over lists, which re-copies the growing list on every concatenation.
investors_names = [investor.strip().title()
                   for investors in df['InvestorsName'].dropna()
                   for investor in investors.split(',')]

# Remove empty names
pd.Series(investors_names) \
    .replace('', np.nan) \
    .dropna() \
    .value_counts()[:20]

In [None]:
# Count each non-empty investor name, then chart the 20 most frequent
investor_counts = pd.Series(investors_names).replace('', np.nan).dropna().value_counts()
investor_counts[:20].plot.bar(color='navy')

plt.title('Top 20 Investors')
plt.xlabel('Investor Name')
plt.ylabel('Count')
plt.show()

## Investment Type

- There are only 4 investment types once the values have been standardised
- In this case, data standardisation can be done by removing spaces and making all letters lowercase
- Most investments are seed funding and private equity, whereas crowd funding and debt funding are extremely rare

In [None]:
# Distinct InvestmentType values as they appear in the data
df['InvestmentType'].unique()

In [None]:
# Standardise the labels in two steps: strip out spaces, then lower-case
investment_type_no_spaces = df['InvestmentType'].str.replace(' ', '')
df['InvestmentType'] = investment_type_no_spaces.str.lower()

df['InvestmentType'].unique()

In [None]:
# Number of fundings per InvestmentType value
df['InvestmentType'].value_counts()

In [None]:
# Bar chart of the number of fundings per investment type
investment_type_counts = df['InvestmentType'].value_counts()
investment_type_counts.plot.bar(color='navy')
plt.title('Fundings by Investment Type')
plt.xlabel('Investment Type')
plt.ylabel('Count')
plt.show()

## Amount in USD

- 1525 investments
- Min USD 16000, max USD 1.4B
- Extremely right skewed distribution
- Startup names are not standardised (e.g. Ola, Olacabs and Ola Cabs are probably the same company)

In [None]:
# Descriptive statistics of AmountInUSD as computed by pandas' describe()
df['AmountInUSD'].describe()

In [None]:
# Box plot of the funding amounts; titled and labelled so the figure stands alone
df['AmountInUSD'].plot.box(color='navy')
plt.title('Distribution of Funding Amount')
plt.ylabel('Amount (USD)')
plt.show()

In [None]:
# Histogram of the funding amounts; titled and labelled so the figure stands alone
df['AmountInUSD'].plot.hist(color='navy')
plt.title('Distribution of Funding Amount')
plt.xlabel('Amount (USD)')
plt.show()

In [None]:
print('Top Investments')
# nlargest() returns the index *labels* (SNo) of the 10 biggest amounts, so the
# rows must be selected with label-based .loc; positional .iloc only gives the
# same rows when SNo happens to equal the row position.
df.loc[df['AmountInUSD'].nlargest(10).index]

# Questions

## How does the funding ecosystem change with time?

- Number of investors who invested in startups plummeted in 2017
  - Did India lose its attractiveness in the startup ecosystem?

In [None]:
# Number of investors over time
# For every quarter, gather the comma-separated names of all fundings into a
# set of trimmed, title-cased names and count it. Non-string entries (missing
# values) and blank names are skipped, so a quarter with no named investor
# counts as 0 instead of 1.
df.groupby(pd.Grouper(key='Date', freq='Q'))['InvestorsName'] \
    .agg(lambda investors_series: len({
        investor.strip().title()
        for investors in investors_series if isinstance(investors, str)
        for investor in investors.split(',')
        if investor.strip()
    }))

In [None]:
# Distinct investors per quarter: for every quarter, gather the comma-separated
# names of all fundings into a set of trimmed, title-cased names and count it.
# Non-string entries (missing values) and blank names are skipped, so a quarter
# with no named investor counts as 0 instead of 1.
investors_per_quarter = df.groupby(pd.Grouper(key='Date', freq='Q'))['InvestorsName'] \
    .agg(lambda investors_series: len({
        investor.strip().title()
        for investors in investors_series if isinstance(investors, str)
        for investor in investors.split(',')
        if investor.strip()
    }))

investors_per_quarter.plot.line(color='navy')
plt.title('Estimated Number of Distinct Investors over Time')
plt.xlabel('Quarter')
plt.ylabel('Distinct Number of Investors')
plt.show()

## Do cities play a major role in funding?

- Yes, startups with a presence in the US and Bangalore markets get more funding on average


In [None]:
# Average funding per city

# Vectorize CityLocation, as one startup can have zero to many of them
vectorizer = CountVectorizer(tokenizer=(lambda locations: [location.strip() for location in locations.split('/')]), lowercase=False)

# index=df.index keeps the count rows labelled by SNo, so the index-based join
# in the following cell pairs each count row with the funding it came from
# (the investor vectorization further down already passes index=df.index).
vectorized_df = pd.DataFrame(vectorizer.fit_transform(df['CityLocation'].replace(np.nan, 'Unknown')).toarray(), 
                             index=df.index,
                             columns=vectorizer.get_feature_names())

vectorized_df.head()

In [None]:
# Join and randomly check for accuracy
joined_df = df[['CityLocation', 'AmountInUSD']].join(vectorized_df)
# Sum of the per-location count columns for each row
joined_df['SumCityLocation'] = joined_df[vectorizer.get_feature_names()].sum(axis=1)

# Spot-check rows with more than one location; random_state fixes the sample
# so a fresh Run All shows the same rows
joined_df[['CityLocation', 'SumCityLocation']].loc[joined_df['SumCityLocation'] > 1].sample(5, random_state=42)

In [None]:
# Mean AmountInUSD over the rows whose count for a location equals 1,
# computed for every location column produced by the vectorizer
city_names = vectorizer.get_feature_names()
average_funding_per_citylocation = pd.Series(
    [joined_df.loc[joined_df[city] == 1, 'AmountInUSD'].mean() for city in city_names],
    index=city_names)

average_funding_per_citylocation.head()

In [None]:
# Double check NaN values: show the rows counted under the 'Agra' column
joined_df.loc[joined_df['Agra'] == 1]

In [None]:
# Locations with a defined average, largest first
funding_by_city_desc = average_funding_per_citylocation.dropna().sort_values(ascending=False)

plt.figure(figsize=(20, 7))
funding_by_city_desc.plot.bar(color='navy')
plt.title('Average Funding per City/Location')
plt.xlabel('City/Location')
plt.ylabel('Average Funding (10 million USD)')
plt.show()

## Which industries are favored by investors for funding?

- Looking at all the industry verticals, the data looks very fragmented, with over 700 unique values. I found that filtering for rows with a non-null sub-vertical results in cleaner industry vertical categories, but at the expense of not analysing data before 2016.
- eCommerce startups get more than double the average funding as compared to all other industries

In [None]:
# Average funding per industry

# Date summary of the rows whose SubVertical is missing
print('Data without SubVertical')
df.loc[df['SubVertical'].isna(), 'Date'].describe()

In [None]:
# Date summary of the rows whose SubVertical is present
print('Data with SubVertical')
df.loc[df['SubVertical'].notna(), 'Date'].describe()

In [None]:
# Industry vertical frequencies among the rows that have a SubVertical
df.loc[df['SubVertical'].notna(), 'IndustryVertical'].value_counts()

In [None]:
# Standardise industry verticals
# Maps each spelling variant to its canonical label; values that are not
# listed are left as they are. (A repeated 'Ecommerce' key was removed: a
# dict keeps only one entry per key, so the duplicate had no effect.)
industry_verticals = {
    'ECommerce': 'eCommerce',
    'Ecommerce': 'eCommerce',
    'ecommerce': 'eCommerce',
    'Food & Beverages': 'Food & Beverage',
    'healthcare': 'Healthcare',
    'Consumer Interne': 'Consumer Internet'
}

# Frequencies of the standardised verticals among rows that have a SubVertical
df.loc[df['SubVertical'].notna(), 'IndustryVertical'] \
    .map(lambda iv: industry_verticals.get(iv, iv)) \
    .value_counts()

In [None]:
# Group key: the industry vertical after applying the standardisation mapping
standardised_vertical = df['IndustryVertical'].map(lambda iv: industry_verticals.get(iv, iv))

# Mean funding per standardised vertical over the rows that have a SubVertical
rows_with_subvertical = df.loc[df['SubVertical'].notna()]
average_by_vertical = rows_with_subvertical.groupby(standardised_vertical)['AmountInUSD'] \
    .mean() \
    .sort_values(ascending=False)

average_by_vertical.plot.bar(color='navy')
plt.title('Average Funding per Industry Vertical (from 2016 onward)')
plt.xlabel('Industry Vertical')
plt.ylabel('Average Funding (10 million USD)')
plt.show()

## Who are the important investors in the Indian Ecosystem?

- Investor names are not exactly the same, and standardising them individually can be a lot of work, so a general approach is done instead
- Most important investors include Sequoia Capital, Accel Partners, SAIF Partners, Kalaari Capital, Blume Ventures, Tiger Global, Indian Angel Network and Nexus Venture Partners

In [None]:
# Number of investments per investor

# Vectorize InvestorsName, as one startup can have zero to many of them.
# Names are split on '.', ',' and '&'; the pattern is a raw string so that
# '\.' reaches the regex engine without relying on an invalid string escape.
vectorizer = CountVectorizer(tokenizer=(lambda investors: [investor.strip() for investor in re.split(r'\.|,|&', investors)]), 
                             lowercase=False)

# Replace null values with 'Undisclosed Investors', keep only the text before
# the first '(' (this drops bracketed remarks and anything after them) and
# titlecase the result
vectorized_df = pd.DataFrame(vectorizer.fit_transform(df['InvestorsName'].replace(np.nan, 'Undisclosed Investors')
                                                                         .map(lambda investor: 
                                                                              investor.split('(')[0]
                                                                              .strip().title()))
                                       .toarray(), 
                             index=df.index,
                             columns=vectorizer.get_feature_names())

vectorized_df.head()

In [None]:
# Attach the per-investor count columns to the funding rows (index-based join)
joined_df = df.join(vectorized_df)
joined_df.head()

In [None]:
# Randomly check for accuracy: raw InvestorsName of five rows counted under
# 'Sequoia Capital'. random_state fixes the sample so a fresh Run All shows
# the same rows.
joined_df.loc[joined_df['Sequoia Capital'] == 1, 'InvestorsName'].sample(5, random_state=42).tolist()

In [None]:
# For every investor column, count the rows where that investor appears once
# and AmountInUSD is not null (count() skips missing amounts)
investor_names = vectorizer.get_feature_names()
num_investments = pd.Series(
    [joined_df.loc[joined_df[investor] == 1, 'AmountInUSD'].count() for investor in investor_names],
    index=investor_names)

# Drop the empty-name token if the tokenizer produced one; errors='ignore'
# avoids a KeyError when it did not. count() never yields NaN, so no
# additional null filter is needed.
num_investments = num_investments.drop('', errors='ignore')
num_investments.head()

In [None]:
# Ten investors with the highest investment counts
num_investments.sort_values(ascending=False).head(10)

In [None]:
# Fifty investors with the highest investment counts, largest first
top_investor_counts = num_investments.sort_values(ascending=False).head(50)

plt.figure(figsize=(20, 5))
top_investor_counts.plot.bar(color='navy')
plt.title('Top 50 Investments per Investor')
plt.xlabel('Investor')
plt.ylabel('Number of Investments Made')
plt.show()

## How much funding do startups generally get in India?

- Even though the startup names are not exactly the same, standardising them can be a lot of work, so a general analysis is done instead
- On average, each startup gets 13.9 million USD, but the distribution is very right skewed with the median at 1 million USD
- Analysing the startups with less than 1 million USD (the left half), most of them have 500,000 USD or less

In [None]:
# Total AmountInUSD per startup name
total_per_startup = df.groupby('StartupName')['AmountInUSD'].sum()
# Keep only the startups whose total is above zero
investments_by_startup = total_per_startup.loc[total_per_startup > 0]
investments_by_startup.describe()

In [None]:
# Histogram (20 bins) of the per-startup totals on a wide figure
wide_figure = (20, 5)
plt.figure(figsize=wide_figure)
investments_by_startup.plot.hist(color='navy', bins=20)
plt.title('Distribution of Total Investments per Startup')
plt.show()

In [None]:
# Summary statistics of the per-startup totals (not used by the plot in this cell)
mean = investments_by_startup.mean()
std_dev = investments_by_startup.std()

# Histogram of the startups whose total is at most 1,000,000
totals_up_to_1m = investments_by_startup.loc[investments_by_startup <= 1000000]
plt.figure(figsize=(20, 5))
totals_up_to_1m.plot.hist(bins=100, color='navy')
plt.title('Distribution of Total Investments per Startup (≤1 million USD)')
plt.show()

In [None]:
# Summary statistics of the per-startup totals (not used by the plot in this cell)
mean = investments_by_startup.mean()
std_dev = investments_by_startup.std()

# Histogram of the startups whose total is at least 1,000,000
totals_from_1m = investments_by_startup.loc[investments_by_startup >= 1000000]
plt.figure(figsize=(20, 5))
totals_from_1m.plot.hist(bins=100, color='navy')
plt.title('Distribution of Total Investments per Startup (≥1 million USD)')
plt.show()

# Self-Created Questions

## Who are the most active investors?
- Sequoia Capital, Accel Partners and Saif Partners are the most active, with an average of 14, 18 and 19 days between investments respectively

In [None]:
# Average days between investments per investor

# Vectorize InvestorsName, as one startup can have zero to many of them.
# Names are split on '.', ',' and '&'; the pattern is a raw string so that
# '\.' reaches the regex engine without relying on an invalid string escape.
vectorizer = CountVectorizer(tokenizer=(lambda investors: [investor.strip() for investor in re.split(r'\.|,|&', investors)]), 
                             lowercase=False)

# Replace null values with 'Undisclosed Investors', keep only the text before
# the first '(' (this drops bracketed remarks and anything after them) and
# titlecase the result
vectorized_df = pd.DataFrame(vectorizer.fit_transform(df['InvestorsName'].replace(np.nan, 'Undisclosed Investors')
                                                                         .map(lambda investor: 
                                                                              investor.split('(')[0]
                                                                              .strip().title()))
                                       .toarray(), 
                             index=df.index,
                             columns=vectorizer.get_feature_names())

vectorized_df.head()

In [None]:
# Attach the per-investor count columns to the funding rows (index-based join)
joined_df = df.join(vectorized_df)
joined_df.head()

In [None]:
# Randomly check for accuracy: raw InvestorsName of five rows counted under
# 'Sequoia Capital'. random_state fixes the sample so a fresh Run All shows
# the same rows.
joined_df.loc[joined_df['Sequoia Capital'] == 1, 'InvestorsName'].sample(5, random_state=42).tolist()

In [None]:
# Keep the investors whose count column sums to 30 or more
sampled_investors = []
for investor in vectorizer.get_feature_names():
    if joined_df[investor].sum() >= 30:
        sampled_investors.append(investor)
sampled_investors[:5]

In [None]:
# For each sampled investor: take the dates of the rows where the investor
# appears once, order them, and average the gaps between consecutive dates
average_days = [
    joined_df.loc[joined_df[investor] == 1]['Date'].sort_values().diff().mean()
    for investor in sampled_investors
]

active_investors_series = pd.Series(average_days, index=sampled_investors)
active_investors_series.head()

In [None]:
# Shortest average gap between investments first
active_investors_series.sort_values()

In [None]:
# Express the average gaps in days (timedelta / one day -> float) so the
# y axis is readable, then chart them from shortest to longest gap
average_gap_in_days = active_investors_series / pd.Timedelta(days=1)
average_gap_in_days.sort_values().plot.bar(color='navy')
plt.title('Average Time Between Investments per Investor')
plt.xlabel('Investor')
plt.ylabel('Average Days Between Investments')
plt.show()