In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

def _format_float(value):
    """Render a float with exactly three decimal places for pandas display."""
    return '%.3f' % value


# Notebook-wide display settings: seaborn theme and fixed-precision floats.
sns.set(style="whitegrid", color_codes=True)
pd.set_option('display.float_format', _format_float)
# Input data files are available in the "../input/" directory.
# Running this cell lists the files found in that directory.

from subprocess import check_output
input_listing = check_output(["ls", "../input"])
print(input_listing.decode("utf8"))

# Any results you write to the current directory are saved as output.

## What does the Dataset even have ??
* Read the dataset
* Parse the date column as datetime type
* Setup index column as SNo


In [None]:
# Load the raw funding records, indexed by serial number.
# The Date column is intentionally NOT parsed by read_csv: a handful of rows
# contain malformed dates that have to be repaired as text first, and the
# explicit day-first format below must be the only parser that touches them.
df = pd.read_csv('../input/startup_funding.csv', index_col='SNo')

# Amounts are stored as text with thousands separators. float64 is used because
# float32 cannot represent whole-dollar amounts above 2**24 exactly.
df.AmountInUSD = df.AmountInUSD.str.replace(',', '', regex=False).astype('float64')

# Normalise inconsistent spellings of the same category.
df.IndustryVertical = df.IndustryVertical.replace('eCommerce', 'ECommerce')
df.InvestmentType = df.InvestmentType.replace({
    'Crowd funding': 'Crowd Funding',
    'SeedFunding': 'Seed Funding',
    'PrivateEquity': 'Private Equity',
})

# Repair the known malformed date strings (literal match, so '.' is not a
# regex wildcard), then parse everything as day/month/year.
date_fixes = (
    ('12/05.2015', '12/05/2015'),
    ('13/04.2015', '13/04/2015'),
    ('15/01.2015', '15/01/2015'),
    ('22/01//2015', '22/01/2015'),
)
raw_dates = df.Date
for bad_date, good_date in date_fixes:
    raw_dates = raw_dates.str.replace(bad_date, good_date, regex=False)
df.Date = pd.to_datetime(raw_dates, format='%d/%m/%Y')
df.head()

## What can we understand from this Dataset with a Bird's eye View ??
* There are 371 entries where startups have the same name, need to have a more detailed look to verify if the records are  repeated or the startups happen to have the same name :D
* There are missing values in most of the columns; the prominent ones are
    * Sub Vertical
    * Remarks
    * Amount
* "**Seed Funding**" seems to be the most common  form in this dataset.
* "**Bangalore**" seems to be the goto location for the startups seeking investment in this dataset.
* "**Consumer Internet**" seems to be the most popular industry vertical in the dataset.
* "**Online Pharmacy**" seems to be the most popular sub vertical in the dataset, this i did not see coming.

In [None]:
df.describe(include='all').loc[['count', 'unique', 'top']]

## Well they say money speaks, what can we see in the startup space ?
* Avg Funding Amount is 12031077 or **12 million USD** in the following dataset.
* There can be a huge difference in the amount of funding which varies from startup to startup which can be expected nevertheless.
    * The minimum amount is **16000 USD**
    * The maximum amount is 1400000000 USD or **1400 million USD**

In [None]:
df.AmountInUSD.describe()

## 1400 million USD??? Wait What !!!

* I had to verify which startups got this kind of investment, and now all the discount offers make sense.

In [None]:
df[df.AmountInUSD == 1400000000]

## Just a min, max on the investment money does not explain much, lets see how it is distributed in general

Well here is the distribution of the Investment amount in the following dataset.

The line chart represents the density of the startups falling under that category (in bins of 20 as a histogram - dark blue is the records, light blue is the bin), The bar chart represents the frequency.

#### Plotting
The two calls to distplot are there to obtain an [unnormalised graph with the raw count](https://github.com/mwaskom/seaborn/issues/479). A single call without the axis changes would plot a normalised score for the same.

#### Analysis
* Most of the startups lie on the lower end of the scale, which is expected as the scale of the amount is skewed because of entities like Flipkart and Paytm

In [None]:
# Funding amounts with missing values removed; both passes plot the same data.
amounts = df.AmountInUSD.dropna()

plt.figure(figsize=(15,10))
# First pass: histogram only (no KDE), so the primary Y axis shows raw counts.
ax = sns.distplot(amounts, kde=False)

# Twin Y axis sharing the same X axis, used for the density overlay.
second_ax = ax.twinx()

# Second pass on the twin axis: a 20-bin histogram together with the KDE curve.
fig = sns.distplot(amounts, ax=second_ax, kde=True, hist=True, bins=20)
# Show plain numbers instead of scientific notation on both axes of the overlay.
for axis in (fig.get_xaxis(), fig.get_yaxis()):
    axis.get_major_formatter().set_scientific(False)
# The overlay's scale is not shown, so hide its Y ticks.
second_ax.set_yticks([])
fig

## What are the Top 20 Verticals where investment is being pumped in ?

* "**ECommerce**" seems to be on the top of the list, closely followed by "**Consumer Internet**", domains which usually require larger investment nevertheless. It would be interesting to see the number of startups contributing to the same in each vertical, because a few giants here can definitely skew the graph.
* The graph definitely represents what atleast the traditional consumer would be accustomed to as the top 20 verticals - Ex [Cab Aggregators, Online Food Delivry, Online Marketplace, Car Rental, Music Streaming App ..]
* Another trend is that a lot of these verticals are B2C model, B2B model exists in Logistics, Technology, Mobile Advertisement Platform.


In [None]:
plt.figure(figsize=(15,10))
# Total disclosed funding per industry vertical. The string alias 'sum' is used
# instead of np.sum, which pandas 2.x deprecates as an aggfunc callable.
total_investment_per_industry = df.pivot_table(index='IndustryVertical', values=['AmountInUSD'], aggfunc='sum', fill_value=0)
top_20_total_investment_per_industry = total_investment_per_industry.sort_values(by=['AmountInUSD'], ascending=False).head(20)
fig = sns.barplot(y=top_20_total_investment_per_industry.index, x=top_20_total_investment_per_industry['AmountInUSD'], orient='h')
fig.get_xaxis().get_major_formatter().set_scientific(False)
fig.set_title('Top 20 Verticals with their Total Investments')
# Label each bar with its total in millions. The label is anchored at the end
# of the bar, vertically centred on it, and nudged right by a fixed 5 points so
# the gap does not grow with the bar length.
for p in fig.patches:
    fig.annotate(
        "{} million $".format(p.get_width() / 1000000.0),
        (p.get_width(), p.get_y() + p.get_height() / 2),
        xytext=(5, 0),
        textcoords='offset points',
        va='center',
    )

## What are the Bottom 20 Verticals where investment is not being pumped in ?
* Seems like it is composed of few new startups which were trying to go ahead with the same model as other existing startups, this shows saturation in the vertical, or this could also be a preliminary investment for the same startups Ex: Food Ordering & Delivery App, Coupon Aggregators, Cab Sharing Service

In [None]:
plt.figure(figsize=(10,10))
# Keep only verticals with a positive total. Filtering rows through .loc drops
# the zero-total verticals outright; a frame-wide boolean mask would instead
# keep them as NaN rows, which could leak into the "bottom 20".
funded_verticals = total_investment_per_industry.loc[total_investment_per_industry['AmountInUSD'] > 0]
bottom_20_total_investment_per_industry = funded_verticals.sort_values(by=['AmountInUSD'], ascending=True).head(20)
fig = sns.barplot(y=bottom_20_total_investment_per_industry.index, x=bottom_20_total_investment_per_industry['AmountInUSD'], orient='h')
fig.set_title('Bottom 20 Total Investments based on Verticals')

## What kind of Startups under the Top 20 Verticals are getting investment??
* What is interesting is that many of these startups revolve around **B2C** with an intent of providing the customer with services for **Food, Entertainment, Shopping, Travel**.
* Under the **B2B** Model with **Technology** as the focus, **Data Analytics Services** seems to be a new entry.
* **Educational Video Content Creator** seems to have also made the list.
* **Online Grocery Shopping** also seems to have made the list, well people have got to eat, atleast towards the end of the month ( When point 1 is on the down low for reasons. )

In [None]:
# Total funding per (vertical, sub-vertical, startup) combination. The string
# alias 'sum' replaces np.sum, which pandas 2.x deprecates as an aggfunc callable.
# NOTE(review): rows with a missing SubVertical appear to be excluded by
# pivot_table's default handling of NaN group keys — confirm this is intended.
total_investment_per_industry_per_vert = df.pivot_table(index=['IndustryVertical', 'SubVertical', 'StartupName'], values=['AmountInUSD'], aggfunc='sum', fill_value=0)
# Show the 20 largest totals.
total_investment_per_industry_per_vert.sort_values(by=['AmountInUSD'], ascending=False).head(20)

## Well i'm sure just because few startups in a certain vertical were very successful, does not help us conclude whether the vertical is in general getting good investment?
* Well, it seems the question is right, considering that the discrepancies between the min and max are extreme for all the top verticals. It is certain there are a lot more factors affecting this; it would be interesting to try to understand what factors can contribute how much, though.
* **Logistics, Finance, Online Food Delivery** seem to be closer to their avg investment, either showing stability in the vertical or more in lines of not too much of a distinguishing factor for the startups here.

In [None]:
# Per-vertical min / max / mean / count of the disclosed funding amounts,
# ordered by how many funded deals each vertical has.
# AmountInUSD is selected BEFORE aggregating: running 'mean' over every column
# would also hit the text and date columns, which raises on recent pandas.
common_industry_verticals = (
    df.dropna(subset=['AmountInUSD'])
    .groupby('IndustryVertical')['AmountInUSD']
    .agg(['min', 'max', 'mean', 'count'])
    .sort_values('count', ascending=False)
)
top_10_common_industry_verticals = common_industry_verticals.head(10)
top_10_common_industry_verticals

In [None]:
# NOTE(review): everything in this cell is commented out, so nothing here executes.
# It looks like an abandoned FacetGrid experiment over top_10_common_industry_verticals.
# The row_order below lists 'median', while the aggregation earlier in the notebook
# computes 'mean' — reconcile that before re-enabling any of it.
# g = sns.FacetGrid(top_10_common_industry_verticals, col='industry')
# g.map(sns.barplot, "count", "mean")

# top_10_common_industry_verticals.loc[top_10_common_industry_verticals.index, 'Industry'] = top_10_common_industry_verticals.index.tolist()
# top_10_common_industry_verticals_plot = pd.melt(top_10_common_industry_verticals, id_vars=['Industry'], var_name="Metric Type", value_name='Value')
# g = sns.FacetGrid(top_10_common_industry_verticals_plot, col='Industry', col_wrap=4, sharex=False, row_order=['count', 'min', 'median', 'max'])
# g.map(sns.barplot, 'Metric Type', 'Value')

## What kind of investment are the investors usually providing ?
* Seems like **Private Equity and Debt Funding** is their GOTO.
* Private Equity though is the de facto choice, as there is a huge difference between that and the next largest also.

In [None]:
plt.figure(figsize=(10, 6))
# Mean disclosed amount per investment type. The string alias 'mean' replaces
# np.mean, which pandas 2.x deprecates as an aggfunc callable.
investment_avg = df.pivot_table(index=['InvestmentType'], values=['AmountInUSD'], aggfunc='mean')
fig = sns.barplot(x=investment_avg.index, y=investment_avg.AmountInUSD)
fig.set_title('Avg Investment based on Type')
# Plain (non-scientific, no offset) tick labels on the numeric Y axis only.
# The keyword is `useOffset`, and the X axis is categorical, so it must be
# excluded: ticklabel_format only works on axes using a ScalarFormatter.
fig.ticklabel_format(axis='y', style='plain', useOffset=False)
# Put each bar's value just above it, horizontally centred on the bar.
for p in fig.patches:
    fig.annotate(
        "{} $".format(p.get_height()),
        (p.get_x() + p.get_width() / 2, p.get_height()),
        ha='center',
        va='bottom',
    )

## How is the Investment trend going on ? Is it getting better ?
* Well, it seems like 2015 had made the most out of it, after a severe drop in 2014 (more than half), 2017 is playing catch-up so far.

In [None]:
# Derive calendar parts from the parsed dates; Month is used by a later cell.
date_parts = df['Date'].dt
df['Year'] = date_parts.year
df['Month'] = date_parts.month

# Total funding per calendar year (missing amounts are skipped by sum()).
yearly_investment = df.groupby('Year')[['AmountInUSD']].sum()

plt.figure(figsize=(10, 5))
fig = sns.pointplot(x=yearly_investment.index, y=yearly_investment['AmountInUSD'])
fig.set_title('Yearly Total Investment')
# Plain numbers rather than scientific notation on the amount axis.
fig.get_yaxis().get_major_formatter().set_scientific(False)

## Was skeptical if month had anything to do with investment, well what the hell.
* Seems like investment usually is not at the start and the end of the year.
* It revolves around the financial year end either before or After Tax Filing i guess.

In [None]:
plt.figure(figsize=(10, 5))
# One violin per calendar month, showing how individual funding amounts are
# distributed within that month. This is a per-deal distribution, not a monthly
# total, so the title says so.
fig = sns.violinplot(x=df.Month, y=df.AmountInUSD)
fig.set_title('Distribution of Investment Amounts by Month')
# Plain numbers rather than scientific notation on the amount axis.
fig.get_yaxis().get_major_formatter().set_scientific(False)